Bläddra i källkod

Add ASTC compression and decompression with Arm astcenc.

Co-authored-by: Gordon A Macpherson <[email protected]>
Co-authored-by: Rémi Verschelde <[email protected]>
K. S. Ernest (iFire) Lee 2 år sedan
förälder
incheckning
696346f4cc
44 ändrade filer med 29247 tillägg och 6 borttagningar
  1. 5 0
      COPYRIGHT.txt
  2. 55 0
      modules/astcenc/SCsub
  3. 6 0
      modules/astcenc/config.py
  4. 251 0
      modules/astcenc/image_compress_astcenc.cpp
  5. 39 0
      modules/astcenc/image_compress_astcenc.h
  6. 48 0
      modules/astcenc/register_types.cpp
  7. 39 0
      modules/astcenc/register_types.h
  8. 6 6
      modules/etcpak/image_compress_etcpak.cpp
  9. 12 0
      thirdparty/README.md
  10. 175 0
      thirdparty/astcenc/LICENSE.txt
  11. 815 0
      thirdparty/astcenc/astcenc.h
  12. 995 0
      thirdparty/astcenc/astcenc_averages_and_directions.cpp
  13. 1184 0
      thirdparty/astcenc/astcenc_block_sizes.cpp
  14. 2071 0
      thirdparty/astcenc/astcenc_color_quantize.cpp
  15. 941 0
      thirdparty/astcenc/astcenc_color_unquantize.cpp
  16. 1455 0
      thirdparty/astcenc/astcenc_compress_symbolic.cpp
  17. 472 0
      thirdparty/astcenc/astcenc_compute_variance.cpp
  18. 623 0
      thirdparty/astcenc/astcenc_decompress_symbolic.cpp
  19. 230 0
      thirdparty/astcenc/astcenc_diagnostic_trace.cpp
  20. 219 0
      thirdparty/astcenc/astcenc_diagnostic_trace.h
  21. 1427 0
      thirdparty/astcenc/astcenc_entry.cpp
  22. 780 0
      thirdparty/astcenc/astcenc_find_best_partitioning.cpp
  23. 1663 0
      thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp
  24. 558 0
      thirdparty/astcenc/astcenc_image.cpp
  25. 739 0
      thirdparty/astcenc/astcenc_integer_sequence.cpp
  26. 2196 0
      thirdparty/astcenc/astcenc_internal.h
  27. 273 0
      thirdparty/astcenc/astcenc_internal_entry.h
  28. 48 0
      thirdparty/astcenc/astcenc_mathlib.cpp
  29. 478 0
      thirdparty/astcenc/astcenc_mathlib.h
  30. 411 0
      thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
  31. 481 0
      thirdparty/astcenc/astcenc_partition_tables.cpp
  32. 1251 0
      thirdparty/astcenc/astcenc_percentile_tables.cpp
  33. 1350 0
      thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp
  34. 166 0
      thirdparty/astcenc/astcenc_platform_isa_detection.cpp
  35. 904 0
      thirdparty/astcenc/astcenc_quantization.cpp
  36. 534 0
      thirdparty/astcenc/astcenc_symbolic_physical.cpp
  37. 570 0
      thirdparty/astcenc/astcenc_vecmathlib.h
  38. 1204 0
      thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h
  39. 423 0
      thirdparty/astcenc/astcenc_vecmathlib_common_4.h
  40. 1072 0
      thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
  41. 1169 0
      thirdparty/astcenc/astcenc_vecmathlib_none_4.h
  42. 1283 0
      thirdparty/astcenc/astcenc_vecmathlib_sse_4.h
  43. 479 0
      thirdparty/astcenc/astcenc_weight_align.cpp
  44. 147 0
      thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp

+ 5 - 0
COPYRIGHT.txt

@@ -141,6 +141,11 @@ Comment: AMD FidelityFX Super Resolution
 Copyright: 2021, Advanced Micro Devices, Inc.
 License: Expat
 
+Files: ./thirdparty/astcenc/
+Comment: Arm ASTC Encoder
+Copyright: 2011-2023, Arm Limited
+License: Apache-2.0
+
 Files: ./thirdparty/basis_universal/
 Comment: Basis Universal
 Copyright: 2022, Binomial LLC.

+ 55 - 0
modules/astcenc/SCsub

@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# SCons build script for the astcenc module. Compiles the bundled Arm
+# ASTC encoder sources from thirdparty/astcenc/ together with the
+# module's own wrapper sources.
+
+Import("env")
+Import("env_modules")
+
+env_astcenc = env_modules.Clone()
+
+# Thirdparty source files
+
+thirdparty_obj = []
+
+thirdparty_dir = "#thirdparty/astcenc/"
+thirdparty_sources = [
+    "astcenc_averages_and_directions.cpp",
+    "astcenc_block_sizes.cpp",
+    "astcenc_color_quantize.cpp",
+    "astcenc_color_unquantize.cpp",
+    "astcenc_compress_symbolic.cpp",
+    "astcenc_compute_variance.cpp",
+    "astcenc_decompress_symbolic.cpp",
+    "astcenc_diagnostic_trace.cpp",
+    "astcenc_entry.cpp",
+    "astcenc_find_best_partitioning.cpp",
+    "astcenc_ideal_endpoints_and_weights.cpp",
+    "astcenc_image.cpp",
+    "astcenc_integer_sequence.cpp",
+    "astcenc_mathlib.cpp",
+    "astcenc_mathlib_softfloat.cpp",
+    "astcenc_partition_tables.cpp",
+    "astcenc_percentile_tables.cpp",
+    "astcenc_pick_best_endpoint_format.cpp",
+    "astcenc_platform_isa_detection.cpp",
+    "astcenc_quantization.cpp",
+    "astcenc_symbolic_physical.cpp",
+    "astcenc_weight_align.cpp",
+    "astcenc_weight_quant_xfer_tables.cpp",
+]
+thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
+
+env_astcenc.Prepend(CPPPATH=[thirdparty_dir])
+
+# Build the thirdparty sources in a separate cloned environment with
+# warnings disabled, so upstream code does not trip the engine's stricter
+# warning flags.
+env_thirdparty = env_astcenc.Clone()
+env_thirdparty.disable_warnings()
+env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
+env.modules_sources += thirdparty_obj
+
+# Godot source files
+
+module_obj = []
+
+env_astcenc.add_source_files(module_obj, "*.cpp")
+env.modules_sources += module_obj
+
+# Needed to force rebuilding the module files when the thirdparty library is updated.
+env.Depends(module_obj, thirdparty_obj)

+ 6 - 0
modules/astcenc/config.py

@@ -0,0 +1,6 @@
+def can_build(env, platform):
+    # The astcenc module is only built for editor builds
+    # (compression happens at import time, not at runtime).
+    return env.editor_build
+
+
+def configure(env):
+    # No module-specific build configuration is required.
+    pass

+ 251 - 0
modules/astcenc/image_compress_astcenc.cpp

@@ -0,0 +1,251 @@
+/**************************************************************************/
+/*  image_compress_astcenc.cpp                                            */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "image_compress_astcenc.h"
+
+#include "core/os/os.h"
+#include "core/string/print_string.h"
+
+#include <astcenc.h>
+
+void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// TODO: See how to handle lossy quality.
+
+	Image::Format img_format = r_img->get_format();
+	if (img_format >= Image::FORMAT_DXT1) {
+		return; // Do not compress, already compressed.
+	}
+
+	// HDR sources are encoded through the HDR profile from 32-bit float
+	// data; everything else is normalized to RGBA8 first.
+	bool is_hdr = false;
+	if ((img_format >= Image::FORMAT_RH) && (img_format <= Image::FORMAT_RGBE9995)) {
+		is_hdr = true;
+		r_img->convert(Image::FORMAT_RGBAF);
+	} else {
+		r_img->convert(Image::FORMAT_RGBA8);
+	}
+
+	// Determine encoder output format from our enum.
+
+	Image::Format target_format = Image::FORMAT_RGBA8;
+	astcenc_profile profile = ASTCENC_PRF_LDR;
+	unsigned int block_x = 4;
+	unsigned int block_y = 4;
+
+	if (p_format == Image::ASTCFormat::ASTC_FORMAT_4x4) {
+		if (is_hdr) {
+			target_format = Image::FORMAT_ASTC_4x4_HDR;
+			profile = ASTCENC_PRF_HDR;
+		} else {
+			target_format = Image::FORMAT_ASTC_4x4;
+		}
+	} else if (p_format == Image::ASTCFormat::ASTC_FORMAT_8x8) {
+		if (is_hdr) {
+			target_format = Image::FORMAT_ASTC_8x8_HDR;
+			profile = ASTCENC_PRF_HDR;
+		} else {
+			target_format = Image::FORMAT_ASTC_8x8;
+		}
+		block_x = 8;
+		block_y = 8;
+	}
+
+	// Compress image data and (if required) mipmaps.
+
+	const bool mipmaps = r_img->has_mipmaps();
+	int width = r_img->get_width();
+	int height = r_img->get_height();
+
+	print_verbose(vformat("astcenc: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));
+
+	// Initialize astcenc.
+
+	astcenc_config config;
+	const float quality = ASTCENC_PRE_MEDIUM;
+
+	// `block_z` must be 1 for 2D images. Passing the X block size here
+	// would request a 3D block footprint (e.g. 4x4x4), producing data
+	// that cannot be decoded as a 2D ASTC texture.
+	astcenc_error status = astcenc_config_init(profile, block_x, block_y, 1, quality, 0, &config);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
+
+	// Context allocation.
+
+	astcenc_context *context = nullptr;
+	const unsigned int thread_count = OS::get_singleton()->get_processor_count();
+
+	status = astcenc_context_alloc(&config, thread_count, &context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
+
+	// Compress image.
+
+	Vector<uint8_t> image_data = r_img->get_data();
+	uint8_t *slices = image_data.ptrw();
+
+	astcenc_image image;
+	image.dim_x = width;
+	image.dim_y = height;
+	image.dim_z = 1;
+	image.data_type = is_hdr ? ASTCENC_TYPE_F32 : ASTCENC_TYPE_U8;
+	image.data = reinterpret_cast<void **>(&slices);
+
+	// Compute the number of ASTC blocks in each dimension; each block
+	// always encodes to 16 bytes regardless of footprint.
+	unsigned int block_count_x = (width + block_x - 1) / block_x;
+	unsigned int block_count_y = (height + block_y - 1) / block_y;
+	size_t comp_len = block_count_x * block_count_y * 16;
+
+	// NOTE(review): only the base level is compressed here, yet `mipmaps`
+	// is forwarded to set_data() below; mip levels would need to be
+	// compressed individually for the data size to match — TODO confirm.
+	Vector<uint8_t> compressed_data;
+	compressed_data.resize(comp_len);
+	compressed_data.fill(0);
+
+	const astcenc_swizzle swizzle = {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	status = astcenc_compress_image(context, &image, &swizzle, compressed_data.ptrw(), comp_len, 0);
+	// Free the context on every exit path so its working buffers do not leak.
+	astcenc_context_free(context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: ASTC image compression failed: %s.", astcenc_get_error_string(status)));
+
+	// Replace original image with compressed one.
+
+	r_img->set_data(width, height, mipmaps, target_format, compressed_data);
+
+	print_verbose(vformat("astcenc: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+}
+
+void _decompress_astc(Image *r_img) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// Determine decompression parameters from image format.
+
+	Image::Format img_format = r_img->get_format();
+	bool is_hdr = false;
+	unsigned int block_x = 0;
+	unsigned int block_y = 0;
+	if (img_format == Image::FORMAT_ASTC_4x4) {
+		block_x = 4;
+		block_y = 4;
+		is_hdr = false;
+	} else if (img_format == Image::FORMAT_ASTC_4x4_HDR) {
+		block_x = 4;
+		block_y = 4;
+		is_hdr = true;
+	} else if (img_format == Image::FORMAT_ASTC_8x8) {
+		block_x = 8;
+		block_y = 8;
+		is_hdr = false;
+	} else if (img_format == Image::FORMAT_ASTC_8x8_HDR) {
+		block_x = 8;
+		block_y = 8;
+		is_hdr = true;
+	} else {
+		ERR_FAIL_MSG("astcenc: Cannot decompress Image with a non-ASTC format.");
+	}
+
+	// Initialize astcenc.
+
+	astcenc_profile profile = ASTCENC_PRF_LDR;
+	if (is_hdr) {
+		profile = ASTCENC_PRF_HDR;
+	}
+	astcenc_config config;
+	const float quality = ASTCENC_PRE_MEDIUM;
+
+	// `block_z` must be 1 for 2D images (passing the X block size would
+	// select a 3D block footprint). ASTCENC_FLG_DECOMPRESS_ONLY skips
+	// allocating compressor-only tables we do not need here.
+	astcenc_error status = astcenc_config_init(profile, block_x, block_y, 1, quality, ASTCENC_FLG_DECOMPRESS_ONLY, &config);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status)));
+
+	// Context allocation.
+
+	astcenc_context *context = nullptr;
+	const unsigned int thread_count = OS::get_singleton()->get_processor_count();
+
+	status = astcenc_context_alloc(&config, thread_count, &context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status)));
+
+	// Decompress image.
+
+	const bool mipmaps = r_img->has_mipmaps();
+	int width = r_img->get_width();
+	int height = r_img->get_height();
+
+	astcenc_image image;
+	image.dim_x = width;
+	image.dim_y = height;
+	image.dim_z = 1;
+	Image::Format target_format = Image::FORMAT_RGBA8;
+	image.data_type = ASTCENC_TYPE_U8;
+	if (is_hdr) {
+		target_format = Image::FORMAT_RGBAF;
+		image.data_type = ASTCENC_TYPE_F32;
+	}
+
+	Vector<uint8_t> image_data = r_img->get_data();
+
+	// NOTE(review): the destination buffer is sized without mipmaps, but
+	// `mipmaps` is forwarded to set_data() below — mip levels are not
+	// decompressed here. TODO confirm intended behavior.
+	Vector<uint8_t> new_image_data;
+	new_image_data.resize(Image::get_image_data_size(width, height, target_format, false));
+	new_image_data.fill(0);
+	uint8_t *slices = new_image_data.ptrw();
+	image.data = reinterpret_cast<void **>(&slices);
+
+	const astcenc_swizzle swizzle = {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	status = astcenc_decompress_image(context, image_data.ptr(), image_data.size(), &image, &swizzle, 0);
+	// Free the context on every exit path so its working buffers do not leak.
+	astcenc_context_free(context);
+	ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS,
+			vformat("astcenc: ASTC decompression failed: %s.", astcenc_get_error_string(status)));
+
+	// Replace original image with the decompressed one. `target_format`
+	// already matches the data type we requested from the decoder, so no
+	// re-derivation from image.data_type is needed.
+	r_img->set_data(width, height, mipmaps, target_format, new_image_data);
+
+	print_verbose(vformat("astcenc: Decompression took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+}

+ 39 - 0
modules/astcenc/image_compress_astcenc.h

@@ -0,0 +1,39 @@
+/**************************************************************************/
+/*  image_compress_astcenc.h                                              */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef IMAGE_COMPRESS_ASTCENC_H
+#define IMAGE_COMPRESS_ASTCENC_H
+
+#include "core/io/image.h"
+
+void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format);
+void _decompress_astc(Image *r_img);
+
+#endif // IMAGE_COMPRESS_ASTCENC_H

+ 48 - 0
modules/astcenc/register_types.cpp

@@ -0,0 +1,48 @@
+/**************************************************************************/
+/*  register_types.cpp                                                    */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "register_types.h"
+
+#include "image_compress_astcenc.h"
+
+void initialize_astcenc_module(ModuleInitializationLevel p_level) {
+	if (p_level == MODULE_INITIALIZATION_LEVEL_SCENE) {
+		// Install the astcenc-backed handlers on Image so the engine can
+		// compress to and decompress from ASTC formats.
+		Image::_image_compress_astc_func = _compress_astc;
+		Image::_image_decompress_astc = _decompress_astc;
+	}
+}
+
+// No teardown is required for this module; this hook only exists to
+// satisfy the module registration interface.
+void uninitialize_astcenc_module(ModuleInitializationLevel p_level) {
+	if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
+		return;
+	}
+}

+ 39 - 0
modules/astcenc/register_types.h

@@ -0,0 +1,39 @@
+/**************************************************************************/
+/*  register_types.h                                                      */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef ASTCENC_REGISTER_TYPES_H
+#define ASTCENC_REGISTER_TYPES_H
+
+#include "modules/register_module_types.h"
+
+void initialize_astcenc_module(ModuleInitializationLevel p_level);
+void uninitialize_astcenc_module(ModuleInitializationLevel p_level);
+
+#endif // ASTCENC_REGISTER_TYPES_H

+ 6 - 6
modules/etcpak/image_compress_etcpak.cpp

@@ -33,8 +33,8 @@
 #include "core/os/os.h"
 #include "core/string/print_string.h"
 
-#include "thirdparty/etcpak/ProcessDxtc.hpp"
-#include "thirdparty/etcpak/ProcessRGB.hpp"
+#include <ProcessDxtc.hpp>
+#include <ProcessRGB.hpp>
 
 EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
 	switch (p_channels) {
@@ -130,7 +130,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5) {
 		target_format = Image::FORMAT_DXT5;
 	} else {
-		ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+		ERR_FAIL_MSG("Invalid or unsupported etcpak compression format, not ETC or DXT.");
 	}
 
 	// Compress image data and (if required) mipmaps.
@@ -171,7 +171,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 
 	const uint8_t *src_read = r_img->get_data().ptr();
 
-	print_verbose(vformat("ETCPAK: Encoding image size %dx%d to format %s.", width, height, Image::get_format_name(target_format)));
+	print_verbose(vformat("etcpak: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : ""));
 
 	int dest_size = Image::get_image_data_size(width, height, target_format, mipmaps);
 	Vector<uint8_t> dest_data;
@@ -232,12 +232,12 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua
 		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
 			CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
 		} else {
-			ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+			ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format.");
 		}
 	}
 
 	// Replace original image with compressed one.
 	r_img->set_data(width, height, mipmaps, target_format, dest_data);
 
-	print_verbose(vformat("ETCPAK encode took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+	print_verbose(vformat("etcpak: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
 }

+ 12 - 0
thirdparty/README.md

@@ -17,6 +17,18 @@ Files extracted from upstream source:
 - `license.txt`
 
 
+## astcenc
+
+- Upstream: https://github.com/ARM-software/astc-encoder
+- Version: 4.3.0 (ec83dda79fcefe07f69cdae7ed980d169bf2c4d4, 2023)
+- License: Apache 2.0
+
+Files extracted from upstream source:
+
+- `astcenc_*` and `astcenc.h` files from `Source`
+- `LICENSE.txt`
+
+
 ## basis_universal
 
 - Upstream: https://github.com/BinomialLLC/basis_universal

+ 175 - 0
thirdparty/astcenc/LICENSE.txt

@@ -0,0 +1,175 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.

+ 815 - 0
thirdparty/astcenc/astcenc.h

@@ -0,0 +1,815 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief The core astcenc codec library interface.
+ *
+ * This interface is the entry point to the core astcenc codec. It aims to be easy to use for
+ * non-experts, but also to allow experts to have fine control over the compressor heuristics if
+ * needed. The core codec only handles compression and decompression, transferring all inputs and
+ * outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
+ * security and stability problems, all transfer buffers are explicitly sized.
+ *
+ * While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
+ * interface tied to a specific source version. We are not trying to maintain backwards
+ * compatibility across codec versions.
+ *
+ * The API state management is based around an explicit context object, which is the context for all
+ * allocated memory resources needed to compress and decompress a single image. A context can be
+ * used to sequentially compress multiple images using the same configuration, allowing setup
+ * overheads to be amortized over multiple images, which is particularly important when images are
+ * small.
+ *
+ * Multi-threading can be used two ways.
+ *
+ *     * An application wishing to process multiple images in parallel can allocate multiple
+ *       contexts and assign each context to a thread.
+ *     * An application wishing to process a single image in using multiple threads can configure
+ *       contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
+ *       for faster processing. The caller is responsible for creating the worker threads, and
+ *       synchronizing between images.
+ *
+ * Threading
+ * =========
+ *
+ * In pseudo-code, the usage for manual user threading looks like this:
+ *
+ *     // Configure the compressor run
+ *     astcenc_config my_config;
+ *     astcenc_config_init(..., &my_config);
+ *
+ *     // Power users can tweak <my_config> settings here ...
+ *
+ *     // Allocate working state given config and thread_count
+ *     astcenc_context* my_context;
+ *     astcenc_context_alloc(&my_config, thread_count, &my_context);
+ *
+ *     // Compress each image using these config settings
+ *     foreach image:
+ *         // For each thread in the thread pool
+ *         for i in range(0, thread_count):
+ *             astcenc_compress_image(my_context, &my_input, my_output, i);
+ *
+ *         astcenc_compress_reset(my_context);
+ *
+ *     // Clean up
+ *     astcenc_context_free(my_context);
+ *
+ * Images
+ * ======
+ *
+ * The codec supports compressing single images, which can be either 2D images or volumetric 3D
+ * images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
+ * texture arrays, or sliced 3D textures.
+ *
+ * Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
+ * half-float, or 32-bit float, as indicated by the data_type field.
+ *
+ * Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
+ *
+ * Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
+ * within an image slice is always tightly packed without padding. Addressing looks like this:
+ *
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4    ]   // Red
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1]   // Green
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2]   // Blue
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3]   // Alpha
+ *
+ * Common compressor usage
+ * =======================
+ *
+ * One of the most important things for coding image quality is to align the input data component
+ * count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
+ * actually need in the endpoint colors.
+ *
+ *         | Input data   | Encoding swizzle | Sampling swizzle |
+ *         | ------------ | ---------------- | ---------------- |
+ *         | 1 component  | RRR1             | .[rgb]           |
+ *         | 2 components | RRRG             | .[rgb]a          |
+ *         | 3 components | RGB1             | .rgb             |
+ *         | 4 components | RGBA             | .rgba            |
+ *
+ * The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
+ * provides best compatibility with other texture formats where the green component may be stored at
+ * higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
+ * the luminance endpoint component will be returned for all three.
+ *
+ * When using the normal map compression mode ASTC will store normals as a two component X+Y map.
+ * Input images must contain unit-length normalized data and should be passed in using a two component
+ * swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
+ * to use GGGR for compatibility with BC5n which will work just as well. The Z component can be
+ * recovered programmatically in shader code, using knowledge that the vector is unit length and
+ * that Z must be positive for a tangent-space normal map.
+ *
+ * Decompress-only usage
+ * =====================
+ *
+ * For some use cases it is useful to have a cut-down context and/or library which supports
+ * decompression but not compression.
+ *
+ * A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
+ * is allocated. These contexts have lower dynamic memory footprint than a full context.
+ *
+ * The entire library can be made decompress-only by building the files with the define
+ * ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
+ * exclude the functionality which is only needed for compression. This reduces the binary size by
+ * ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
+ *
+ * Note that context structures returned by a library built as decompress-only are incompatible with
+ * a library built with compression included, and visa versa, as they have different sizes and
+ * memory layout.
+ *
+ * Self-decompress-only usage
+ * ==========================
+ *
+ * ASTC is a complex format with a large search space. The parts of this search space that are
+ * searched is determined by heuristics that are, in part, tied to the quality level used when
+ * creating the context.
+ *
+ * A normal context is capable of decompressing any ASTC texture, including those generated by other
+ * compressors with unknown heuristics. This is the most flexible implementation, but forces the
+ * data tables used by the codec to include entries that are not needed during compression. This
+ * can slow down context creation by a significant amount, especially for the faster compression
+ * modes where few data table entries are actually used. To optimize this use case the context can
+ * be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
+ * only be asked to decompress images that it compressed itself, allowing the data tables to
+ * exclude entries that are not needed by the current compression configuration. This reduces the
+ * size of the context data tables in memory and improves context creation performance. Note that,
+ * as of the 3.6 release, this flag no longer affects compression performance.
+ *
+ * Using this flag while attempting to decompress a valid image which was created by another
+ * compressor, or even another astcenc compressor version or configuration, may result in blocks
+ * returning as solid magenta or NaN value error blocks.
+ */
+
+#ifndef ASTCENC_INCLUDED
+#define ASTCENC_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ASTCENC_DYNAMIC_LIBRARY)
+	#if defined(_MSC_VER)
+		#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
+	#else
+		#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
+	#endif
+#else
+	#define ASTCENC_PUBLIC
+#endif
+
+/* ============================================================================
+    Data declarations
+============================================================================ */
+
+/**
+ * @brief An opaque structure; see astcenc_internal.h for definition.
+ */
+struct astcenc_context;
+
+/**
+ * @brief A codec API error code.
+ */
+enum astcenc_error {
+	/** @brief The call was successful. */
+	ASTCENC_SUCCESS = 0,
+	/** @brief The call failed due to low memory, or undersized I/O buffers. */
+	ASTCENC_ERR_OUT_OF_MEM,
+	/** @brief The call failed due to the build using fast math. */
+	ASTCENC_ERR_BAD_CPU_FLOAT,
+	/** @brief The call failed due to the build using an unsupported ISA. */
+	ASTCENC_ERR_BAD_CPU_ISA,
+	/** @brief The call failed due to an out-of-spec parameter. */
+	ASTCENC_ERR_BAD_PARAM,
+	/** @brief The call failed due to an out-of-spec block size. */
+	ASTCENC_ERR_BAD_BLOCK_SIZE,
+	/** @brief The call failed due to an out-of-spec color profile. */
+	ASTCENC_ERR_BAD_PROFILE,
+	/** @brief The call failed due to an out-of-spec quality value. */
+	ASTCENC_ERR_BAD_QUALITY,
+	/** @brief The call failed due to an out-of-spec component swizzle. */
+	ASTCENC_ERR_BAD_SWIZZLE,
+	/** @brief The call failed due to an out-of-spec flag set. */
+	ASTCENC_ERR_BAD_FLAGS,
+	/** @brief The call failed due to the context not supporting the operation. */
+	ASTCENC_ERR_BAD_CONTEXT,
+	/** @brief The call failed due to unimplemented functionality. */
+	ASTCENC_ERR_NOT_IMPLEMENTED,
+#if defined(ASTCENC_DIAGNOSTICS)
+	/** @brief The call failed due to an issue with diagnostic tracing. */
+	ASTCENC_ERR_DTRACE_FAILURE,
+#endif
+};
+
+/**
+ * @brief A codec color profile.
+ */
+enum astcenc_profile {
+	/** @brief The LDR sRGB color profile. */
+	ASTCENC_PRF_LDR_SRGB = 0,
+	/** @brief The LDR linear color profile. */
+	ASTCENC_PRF_LDR,
+	/** @brief The HDR RGB with LDR alpha color profile. */
+	ASTCENC_PRF_HDR_RGB_LDR_A,
+	/** @brief The HDR RGBA color profile. */
+	ASTCENC_PRF_HDR
+};
+
+/** @brief The fastest, lowest quality, search preset. */
+static const float ASTCENC_PRE_FASTEST = 0.0f;
+
+/** @brief The fast search preset. */
+static const float ASTCENC_PRE_FAST = 10.0f;
+
+/** @brief The medium quality search preset. */
+static const float ASTCENC_PRE_MEDIUM = 60.0f;
+
+/** @brief The thorough quality search preset. */
+static const float ASTCENC_PRE_THOROUGH = 98.0f;
+
+/** @brief The very thorough quality search preset. */
+static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
+
+/** @brief The exhaustive, highest quality, search preset. */
+static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
+
+/**
+ * @brief A codec component swizzle selector.
+ */
+enum astcenc_swz
+{
+	/** @brief Select the red component. */
+	ASTCENC_SWZ_R = 0,
+	/** @brief Select the green component. */
+	ASTCENC_SWZ_G = 1,
+	/** @brief Select the blue component. */
+	ASTCENC_SWZ_B = 2,
+	/** @brief Select the alpha component. */
+	ASTCENC_SWZ_A = 3,
+	/** @brief Use a constant zero component. */
+	ASTCENC_SWZ_0 = 4,
+	/** @brief Use a constant one component. */
+	ASTCENC_SWZ_1 = 5,
+	/** @brief Use a reconstructed normal vector Z component. */
+	ASTCENC_SWZ_Z = 6
+};
+
+/**
+ * @brief A texel component swizzle.
+ */
+struct astcenc_swizzle
+{
+	/** @brief The red component selector. */
+	astcenc_swz r;
+	/** @brief The green component selector. */
+	astcenc_swz g;
+	/** @brief The blue component selector. */
+	astcenc_swz b;
+	/** @brief The alpha component selector. */
+	astcenc_swz a;
+};
+
+/**
+ * @brief A texel component data format.
+ */
+enum astcenc_type
+{
+	/** @brief Unorm 8-bit data per component. */
+	ASTCENC_TYPE_U8 = 0,
+	/** @brief 16-bit float per component. */
+	ASTCENC_TYPE_F16 = 1,
+	/** @brief 32-bit float per component. */
+	ASTCENC_TYPE_F32 = 2
+};
+
+/**
+ * @brief Enable normal map compression.
+ *
+ * Input data will be treated as a two component normal map, storing X and Y, and the codec will
+ * optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
+ * be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
+ * used by BC5n).
+ */
+static const unsigned int ASTCENC_FLG_MAP_NORMAL          = 1 << 0;
+
+/**
+ * @brief Enable alpha weighting.
+ *
+ * The input alpha value is used for transparency, so errors in the RGB components are weighted by
+ * the transparency level. This allows the codec to more accurately encode the alpha value in areas
+ * where the color value is less significant.
+ */
+static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT     = 1 << 2;
+
+/**
+ * @brief Enable perceptual error metrics.
+ *
+ * This mode enables perceptual compression mode, which will optimize for perceptual error rather
+ * than best PSNR. Only some input modes support perceptual error metrics.
+ */
+static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL       = 1 << 3;
+
+/**
+ * @brief Create a decompression-only context.
+ *
+ * This mode disables support for compression. This enables context allocation to skip some
+ * transient buffer allocation, resulting in lower memory usage.
+ */
+static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY      = 1 << 4;
+
+/**
+ * @brief Create a self-decompression context.
+ *
+ * This mode configures the compressor so that it is only guaranteed to be able to decompress images
+ * that were actually created using the current context. This is the common case for compression use
+ * cases, and setting this flag enables additional optimizations, but does mean that the context
+ * cannot reliably decompress arbitrary ASTC images.
+ */
+static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
+
+/**
+ * @brief Enable RGBM map compression.
+ *
+ * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
+ * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
+ * compression function, this flag is only used to control the use of RGBM-specific heuristics and
+ * error metrics.
+ *
+ * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
+ * M values can round to zero due to quantization and result in black or white pixels. It is highly
+ * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
+ * 16 or 32). Applying this threshold reduces the number of very dark colors that can be
+ * represented, but is still higher precision than 8-bit LDR.
+ *
+ * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
+ * factor used during reconstruction. This defaults to 5 when in RGBM mode.
+ *
+ * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
+ * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
+ * matching the default scale factor.
+ */
+static const unsigned int ASTCENC_FLG_MAP_RGBM             = 1 << 6;
+
+/**
+ * @brief The bit mask of all valid flags.
+ */
+static const unsigned int ASTCENC_ALL_FLAGS =
+                              ASTCENC_FLG_MAP_NORMAL |
+                              ASTCENC_FLG_MAP_RGBM |
+                              ASTCENC_FLG_USE_ALPHA_WEIGHT |
+                              ASTCENC_FLG_USE_PERCEPTUAL |
+                              ASTCENC_FLG_DECOMPRESS_ONLY |
+                              ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+
+/**
+ * @brief The config structure.
+ *
+ * This structure will initially be populated by a call to astcenc_config_init, but power users may
+ * modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
+ * documentation of the power-user settings.
+ *
+ * Note for any settings which are associated with a specific color component, the value in the
+ * config applies to the component that exists after any compression data swizzle is applied.
+ */
+struct astcenc_config
+{
+	/** @brief The color profile. */
+	astcenc_profile profile;
+
+	/** @brief The set of set flags. */
+	unsigned int flags;
+
+	/** @brief The ASTC block size X dimension. */
+	unsigned int block_x;
+
+	/** @brief The ASTC block size Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The ASTC block size Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The red component weight scale for error weighting (-cw). */
+	float cw_r_weight;
+
+	/** @brief The green component weight scale for error weighting (-cw). */
+	float cw_g_weight;
+
+	/** @brief The blue component weight scale for error weighting (-cw). */
+	float cw_b_weight;
+
+	/** @brief The alpha component weight scale for error weighting (-cw). */
+	float cw_a_weight;
+
+	/**
+	 * @brief The radius for any alpha-weight scaling (-a).
+	 *
+	 * It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
+	 * will be sampled using linear texture filtering to minimize color bleed out of transparent
+	 * texels that are adjacent to non-transparent texels.
+	 */
+	unsigned int a_scale_radius;
+
+	/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
+	float rgbm_m_scale;
+
+	/**
+	 * @brief The maximum number of partitions searched (-partitioncountlimit).
+	 *
+	 * Valid values are between 1 and 4.
+	 */
+	unsigned int tune_partition_count_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-2partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_2partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-3partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_3partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitions searched (-4partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_4partition_index_limit;
+
+	/**
+	 * @brief The maximum centile for block modes searched (-blockmodelimit).
+	 *
+	 * Valid values are between 1 and 100.
+	 */
+	unsigned int tune_block_mode_limit;
+
+	/**
+	 * @brief The maximum iterative refinements applied (-refinementlimit).
+	 *
+	 * Valid values are between 1 and N; there is no technical upper limit
+	 * but little benefit is expected after N=4.
+	 */
+	unsigned int tune_refinement_limit;
+
+	/**
+	 * @brief The number of trial candidates per mode search (-candidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
+	 */
+	unsigned int tune_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-2partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_2partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-3partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_3partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-4partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_4partitioning_candidate_limit;
+
+	/**
+	 * @brief The dB threshold for stopping block search (-dblimit).
+	 *
+	 * This option is ineffective for HDR textures.
+	 */
+	float tune_db_limit;
+
+	/**
+	 * @brief The amount of MSE overshoot needed to early-out trials.
+	 *
+	 * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
+	 * the high probability block modes. This can short-cut compression for simple blocks.
+	 *
+	 * The second early-out is for refinement trials, where we can exit refinement once quality is
+	 * reached.
+	 */
+	float tune_mse_overshoot;
+
+	/**
+	 * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_2_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_3_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
+	 *
+	 * This option is ineffective for normal maps.
+	 */
+	float tune_2_plane_early_out_limit_correlation;
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	/**
+	 * @brief The path to save the diagnostic trace data to.
+	 *
+	 * This option is not part of the public API, and requires special builds
+	 * of the library.
+	 */
+	const char* trace_file_path;
+#endif
+};
+
+/**
+ * @brief An uncompressed 2D or 3D image.
+ *
+ * 3D image are passed in as an array of 2D slices. Each slice has identical
+ * size and color format.
+ */
+struct astcenc_image
+{
+	/** @brief The X dimension of the image, in texels. */
+	unsigned int dim_x;
+
+	/** @brief The Y dimension of the image, in texels. */
+	unsigned int dim_y;
+
+	/** @brief The Z dimension of the image, in texels. */
+	unsigned int dim_z;
+
+	/** @brief The data type per component. */
+	astcenc_type data_type;
+
+	/** @brief The array of 2D slices, of length @c dim_z. */
+	void** data;
+};
+
+/**
+ * @brief A block encoding metadata query result.
+ *
+ * If the block is an error block or a constant color block, all fields other than
+ * the profile, block dimensions, and error/constant indicator will be zero.
+ */
+struct astcenc_block_info
+{
+	/** @brief The block encoding color profile. */
+	astcenc_profile profile;
+
+	/** @brief The number of texels in the X dimension. */
+	unsigned int block_x;
+
+	/** @brief The number of texels in the Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The number of texels in the Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The number of texels in the block. */
+	unsigned int texel_count;
+
+	/** @brief True if this block is an error block. */
+	bool is_error_block;
+
+	/** @brief True if this block is a constant color block. */
+	bool is_constant_block;
+
+	/** @brief True if this block is an HDR block. */
+	bool is_hdr_block;
+
+	/** @brief True if this block uses two weight planes. */
+	bool is_dual_plane_block;
+
+	/** @brief The number of partitions if not constant color. */
+	unsigned int partition_count;
+
+	/** @brief The partition index if 2 - 4 partitions used. */
+	unsigned int partition_index;
+
+	/** @brief The component index of the second plane if dual plane. */
+	unsigned int dual_plane_component;
+
+	/** @brief The color endpoint encoding mode for each partition. */
+	unsigned int color_endpoint_modes[4];
+
+	/** @brief The number of color endpoint quantization levels. */
+	unsigned int color_level_count;
+
+	/** @brief The number of weight quantization levels. */
+	unsigned int weight_level_count;
+
+	/** @brief The number of weights in the X dimension. */
+	unsigned int weight_x;
+
+	/** @brief The number of weights in the Y dimension. */
+	unsigned int weight_y;
+
+	/** @brief The number of weights in the Z dimension. */
+	unsigned int weight_z;
+
+	/** @brief The unpacked color endpoints for each partition. */
+	float color_endpoints[4][2][4];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane1[216];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane2[216];
+
+	/** @brief The per-texel partition assignments for the block. */
+	uint8_t partition_assignment[216];
+};
+
+/**
+ * Populate a codec config based on default settings.
+ *
+ * Power users can edit the returned config struct to fine tune before allocating the context.
+ *
+ * @param      profile   Color profile.
+ * @param      block_x   ASTC block size X dimension.
+ * @param      block_y   ASTC block size Y dimension.
+ * @param      block_z   ASTC block size Z dimension.
+ * @param      quality   Search quality preset / effort level. Either an
+ *                       @c ASTCENC_PRE_* value, or an effort level between 0
+ *                       and 100. Performance is not linear between 0 and 100.
+
+ * @param      flags     A valid set of @c ASTCENC_FLG_* flag bits.
+ * @param[out] config    Output config struct to populate.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
+ * either individually, or in combination.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_config_init(
+	astcenc_profile profile,
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z,
+	float quality,
+	unsigned int flags,
+	astcenc_config* config);
+
+/**
+ * @brief Allocate a new codec context based on a config.
+ *
+ * This function allocates all of the memory resources and threads needed by the codec. This can be
+ * slow, so it is recommended that contexts are reused to serially compress or decompress multiple
+ * images to amortize setup cost.
+ *
+ * Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
+ * flag when creating the configuration. The compression functions will fail if invoked. For a
+ * decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
+ * any context.
+ *
+ * @param[in]  config         Codec config.
+ * @param      thread_count   Thread count to configure for.
+ * @param[out] context        Location to store an opaque context pointer.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
+	const astcenc_config* config,
+	unsigned int thread_count,
+	astcenc_context** context);
+
+/**
+ * @brief Compress an image.
+ *
+ * A single context can only compress or decompress a single image at a time.
+ *
+ * For a context configured for multi-threading, any set of the N threads can call this function.
+ * Work will be dynamically scheduled across the threads available. Each thread must have a unique
+ * @c thread_index.
+ *
+ * @param         context        Codec context.
+ * @param[in,out] image          An input image, in 2D slices.
+ * @param         swizzle        Compression data swizzle, applied before compression.
+ * @param[out]    data_out       Pointer to output data array.
+ * @param         data_len       Length of the output data array.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
+	astcenc_context* context,
+	astcenc_image* image,
+	const astcenc_swizzle* swizzle,
+	uint8_t* data_out,
+	size_t data_len,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new compression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_compress_image() function for image N,
+ * but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Decompress an image.
+ *
+ * @param         context        Codec context.
+ * @param[in]     data           Pointer to compressed data.
+ * @param         data_len       Length of the compressed data, in bytes.
+ * @param[in,out] image_out      Output image.
+ * @param         swizzle        Decompression data swizzle, applied after decompression.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
+	astcenc_context* context,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_out,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new decompression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_decompress_image() function for image
+ * N, but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
+	astcenc_context* context);
+
+/**
+ * Free the compressor context.
+ *
+ * @param context   The codec context.
+ */
+ASTCENC_PUBLIC void astcenc_context_free(
+	astcenc_context* context);
+
+/**
+ * @brief Provide a high level summary of a block's encoding.
+ *
+ * This feature is primarily useful for codec developers but may be useful for developers building
+ * advanced content packaging pipelines.
+ *
+ * @param context   Codec context.
+ * @param data      One block of compressed ASTC data.
+ * @param info      The output info structure to populate.
+ *
+ * @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
+ *         function will return success even if the block itself was an error block encoding, as the
+ *         decode was correctly handled.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
+	astcenc_context* context,
+	const uint8_t data[16],
+	astcenc_block_info* info);
+
+/**
+ * @brief Get a printable string for specific status code.
+ *
+ * @param status   The status value.
+ *
+ * @return A human readable nul-terminated string.
+ */
+ASTCENC_PUBLIC const char* astcenc_get_error_string(
+	astcenc_error status);
+
+#endif

+ 995 - 0
thirdparty/astcenc/astcenc_averages_and_directions.cpp

@@ -0,0 +1,995 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for finding dominant direction of a set of colors.
+ */
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Compute the average RGB color of each partition.
+ *
+ * The algorithm here uses a vectorized sequential scan and per-partition
+ * color accumulators, using select() to mask texel lanes in other partitions.
+ *
+ * We only accumulate sums for N-1 partitions during the scan; the value for
+ * the last partition can be computed given that we know the block-wide average
+ * already.
+ *
+ * Because of this we could reduce the loop iteration count so it "just" spans
+ * the max texel index needed for the N-1 partitions, which could need fewer
+ * iterations than the full block texel count. However, this makes the loop
+ * count erratic and causes more branch mispredictions so is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized, and lane<3> will be zero.
+ */
+static void compute_partition_averages_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// For 1 partition just use the precomputed mean
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean.swz<0, 1, 2>();
+	}
+	// For 2 partitions scan results for partition 0, compute partition 1
+	else if (partition_count == 2)
+	{
+		vfloatacc pp_avg_rgb[3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0], data_r, p0_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[1], data_g, p0_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
+		                           hadd_s(pp_avg_rgb[1]),
+		                           hadd_s(pp_avg_rgb[2]));
+
+		vfloat4 p1_total = block_total - p0_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+	}
+	// For 3 partitions scan results for partition 0/1, compute partition 2
+	else if (partition_count == 3)
+	{
+		vfloatacc pp_avg_rgb[2][3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
+		                           hadd_s(pp_avg_rgb[0][1]),
+		                           hadd_s(pp_avg_rgb[0][2]));
+
+		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
+		                           hadd_s(pp_avg_rgb[1][1]),
+		                           hadd_s(pp_avg_rgb[1][2]));
+
+		vfloat4 p2_total = block_total - p0_total - p1_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+	}
+	else
+	{
+		// For 4 partitions scan results for partition 0/1/2, compute partition 3
+		vfloatacc pp_avg_rgb[3][3] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+			vmask p2_mask = lane_mask & (texel_partition == vint(2));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
+			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
+			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
+			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
+		                           hadd_s(pp_avg_rgb[0][1]),
+		                           hadd_s(pp_avg_rgb[0][2]));
+
+		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
+		                           hadd_s(pp_avg_rgb[1][1]),
+		                           hadd_s(pp_avg_rgb[1][2]));
+
+		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
+		                           hadd_s(pp_avg_rgb[2][1]),
+		                           hadd_s(pp_avg_rgb[2][2]));
+
+		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
+	}
+}
+
+/**
+ * @brief Compute the average RGBA color of each partition.
+ *
+ * The algorithm here uses a vectorized sequential scan and per-partition
+ * color accumulators, using select() to mask texel lanes in other partitions.
+ *
+ * We only accumulate sums for N-1 partitions during the scan; the value for
+ * the last partition can be computed given that we know the block-wide average
+ * already.
+ *
+ * Because of this we could reduce the loop iteration count so it "just" spans
+ * the max texel index needed for the N-1 partitions, which could need fewer
+ * iterations than the full block texel count. However, this makes the loop
+ * count erratic and causes more branch mispredictions so is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized.
+ */
+static void compute_partition_averages_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// For 1 partition just use the precomputed mean
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean;
+	}
+	// For 2 partitions scan results for partition 0, compute partition 1
+	else if (partition_count == 2)
+	{
+		vfloat4 pp_avg_rgba[4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0], data_r, p0_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[1], data_g, p0_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[2], data_b, p0_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
+		                           hadd_s(pp_avg_rgba[1]),
+		                           hadd_s(pp_avg_rgba[2]),
+		                           hadd_s(pp_avg_rgba[3]));
+
+		vfloat4 p1_total = block_total - p0_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+	}
+	// For 3 partitions scan results for partition 0/1, compute partition 2
+	else if (partition_count == 3)
+	{
+		vfloat4 pp_avg_rgba[2][4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
+			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
+		                           hadd_s(pp_avg_rgba[0][1]),
+		                           hadd_s(pp_avg_rgba[0][2]),
+		                           hadd_s(pp_avg_rgba[0][3]));
+
+		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
+		                           hadd_s(pp_avg_rgba[1][1]),
+		                           hadd_s(pp_avg_rgba[1][2]),
+		                           hadd_s(pp_avg_rgba[1][3]));
+
+		vfloat4 p2_total = block_total - p0_total - p1_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+	}
+	else
+	{
+		// For 4 partitions scan results for partition 0/1/2, compute partition 3
+		vfloat4 pp_avg_rgba[3][4] {};
+
+		vint lane_id = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vint texel_partition(pi.partition_of_texel + i);
+
+			vmask lane_mask = lane_id < vint(texel_count);
+			lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask p0_mask = lane_mask & (texel_partition == vint(0));
+			vmask p1_mask = lane_mask & (texel_partition == vint(1));
+			vmask p2_mask = lane_mask & (texel_partition == vint(2));
+
+			vfloat data_r = loada(blk.data_r + i);
+			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
+			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
+			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
+
+			vfloat data_g = loada(blk.data_g + i);
+			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
+			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
+			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
+
+			vfloat data_b = loada(blk.data_b + i);
+			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
+			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
+			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
+
+			vfloat data_a = loada(blk.data_a + i);
+			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
+			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
+			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
+		}
+
+		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
+
+		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
+		                           hadd_s(pp_avg_rgba[0][1]),
+		                           hadd_s(pp_avg_rgba[0][2]),
+		                           hadd_s(pp_avg_rgba[0][3]));
+
+		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
+		                           hadd_s(pp_avg_rgba[1][1]),
+		                           hadd_s(pp_avg_rgba[1][2]),
+		                           hadd_s(pp_avg_rgba[1][3]));
+
+		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
+		                           hadd_s(pp_avg_rgba[2][1]),
+		                           hadd_s(pp_avg_rgba[2][2]),
+		                           hadd_s(pp_avg_rgba[2][3]));
+
+		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
+
+		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
+		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_4_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, partition_averages);
+
+	for (int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+		vfloat4 sum_wp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+			vfloat4 texel_datum = blk.texel(iwt);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+
+			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
+			sum_wp += select(zero, texel_datum, tdm3);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+		vfloat4 prod_wp = dot(sum_wp, sum_wp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+		best_sum = select(best_sum, prod_zp, mask);
+
+		mask = prod_wp > best_sum;
+		best_vector = select(best_vector, sum_wp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int omitted_component,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, partition_averages);
+
+	const float* data_vr = blk.data_r;
+	const float* data_vg = blk.data_g;
+	const float* data_vb = blk.data_b;
+
+	// TODO: Data-driven permute would be useful to avoid this ...
+	if (omitted_component == 0)
+	{
+		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
+		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
+		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
+		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
+
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 1)
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
+		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
+		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
+		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
+
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 2)
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
+		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
+		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
+		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
+
+		data_vb = blk.data_a;
+	}
+	else
+	{
+		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
+		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
+		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
+		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
+	}
+
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+
+			vfloat4 texel_datum = vfloat3(data_vr[iwt],
+			                              data_vg[iwt],
+			                              data_vb[iwt]);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	// Pre-compute partition_averages
+	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgb(pi, blk, partition_averages);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		vfloat4 average = partition_averages[partition];
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+		vfloat4 sum_zp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+
+			vfloat4 texel_datum = blk.texel3(iwt);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+
+			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
+			sum_zp += select(zero, texel_datum, tdm2);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+		vfloat4 prod_zp = dot(sum_zp, sum_zp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+		best_sum = select(best_sum, prod_yp, mask);
+
+		mask = prod_zp > best_sum;
+		best_vector = select(best_vector, sum_zp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_2_comp(
+	const partition_info& pt,
+	const image_block& blk,
+	unsigned int component1,
+	unsigned int component2,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	vfloat4 average;
+
+	const float* data_vr = nullptr;
+	const float* data_vg = nullptr;
+
+	if (component1 == 0 && component2 == 1)
+	{
+		average = blk.data_mean.swz<0, 1>();
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_g;
+	}
+	else if (component1 == 0 && component2 == 2)
+	{
+		average = blk.data_mean.swz<0, 2>();
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_b;
+	}
+	else // (component1 == 1 && component2 == 2)
+	{
+		assert(component1 == 1 && component2 == 2);
+
+		average = blk.data_mean.swz<1, 2>();
+
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+	}
+
+	unsigned int partition_count = pt.partition_count;
+	promise(partition_count > 0);
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
+		unsigned int texel_count = pt.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		// Only compute a partition mean if more than one partition
+		if (partition_count > 1)
+		{
+			average = vfloat4::zero();
+			for (unsigned int i = 0; i < texel_count; i++)
+			{
+				unsigned int iwt = texel_indexes[i];
+				average += vfloat2(data_vr[iwt], data_vg[iwt]);
+			}
+
+			average = average / static_cast<float>(texel_count);
+		}
+
+		pm[partition].avg = average;
+
+		vfloat4 sum_xp = vfloat4::zero();
+		vfloat4 sum_yp = vfloat4::zero();
+
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			unsigned int iwt = texel_indexes[i];
+			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
+			texel_datum = texel_datum - average;
+
+			vfloat4 zero = vfloat4::zero();
+
+			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
+			sum_xp += select(zero, texel_datum, tdm0);
+
+			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
+			sum_yp += select(zero, texel_datum, tdm1);
+		}
+
+		vfloat4 prod_xp = dot(sum_xp, sum_xp);
+		vfloat4 prod_yp = dot(sum_yp, sum_yp);
+
+		vfloat4 best_vector = sum_xp;
+		vfloat4 best_sum = prod_xp;
+
+		vmask4 mask = prod_yp > best_sum;
+		best_vector = select(best_vector, sum_yp, mask);
+
+		pm[partition].dir = best_vector;
+	}
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
+	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
+	float uncor_lengths[BLOCK_MAX_PARTITIONS],
+	float samec_lengths[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	vfloatacc uncor_errorsumv = vfloatacc::zero();
+	vfloatacc samec_errorsumv = vfloatacc::zero();
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+
+		float uncor_loparam = 1e10f;
+		float uncor_hiparam = -1e10f;
+
+		float samec_loparam = 1e10f;
+		float samec_hiparam = -1e10f;
+
+		processed_line4 l_uncor = uncor_plines[partition];
+		processed_line4 l_samec = samec_plines[partition];
+
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		// Vectorize some useful scalar inputs
+		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
+		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
+		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
+		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
+
+		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
+		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
+		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
+		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
+
+		vfloat l_samec_bs0(l_samec.bs.lane<0>());
+		vfloat l_samec_bs1(l_samec.bs.lane<1>());
+		vfloat l_samec_bs2(l_samec.bs.lane<2>());
+		vfloat l_samec_bs3(l_samec.bs.lane<3>());
+
+		assert(all(l_samec.amod == vfloat4(0.0f)));
+
+		vfloat uncor_loparamv(1e10f);
+		vfloat uncor_hiparamv(-1e10f);
+
+		vfloat samec_loparamv(1e10f);
+		vfloat samec_hiparamv(-1e10f);
+
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+		vfloat ew_a(blk.channel_weight.lane<3>());
+
+		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
+		// array to extend the last value. This means min/max are not impacted, but we need to mask
+		// out the dummy values when we compute the line weighting.
+		vint lane_ids = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vmask mask = lane_ids < vint(texel_count);
+			vint texel_idxs(texel_indexes + i);
+
+			vfloat data_r = gatherf(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+			vfloat data_a = gatherf(blk.data_a, texel_idxs);
+
+			vfloat uncor_param = (data_r * l_uncor_bs0)
+			                   + (data_g * l_uncor_bs1)
+			                   + (data_b * l_uncor_bs2)
+			                   + (data_a * l_uncor_bs3);
+
+			uncor_loparamv = min(uncor_param, uncor_loparamv);
+			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
+
+			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+			                   + (uncor_param * l_uncor_bs0);
+			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+			                   + (uncor_param * l_uncor_bs1);
+			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+			                   + (uncor_param * l_uncor_bs2);
+			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
+			                   + (uncor_param * l_uncor_bs3);
+
+			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+			                 + (ew_g * uncor_dist1 * uncor_dist1)
+			                 + (ew_b * uncor_dist2 * uncor_dist2)
+			                 + (ew_a * uncor_dist3 * uncor_dist3);
+
+			haccumulate(uncor_errorsumv, uncor_err, mask);
+
+			// Process samechroma data
+			vfloat samec_param = (data_r * l_samec_bs0)
+			                   + (data_g * l_samec_bs1)
+			                   + (data_b * l_samec_bs2)
+			                   + (data_a * l_samec_bs3);
+
+			samec_loparamv = min(samec_param, samec_loparamv);
+			samec_hiparamv = max(samec_param, samec_hiparamv);
+
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
+			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
+
+			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+			                 + (ew_g * samec_dist1 * samec_dist1)
+			                 + (ew_b * samec_dist2 * samec_dist2)
+			                 + (ew_a * samec_dist3 * samec_dist3);
+
+			haccumulate(samec_errorsumv, samec_err, mask);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		uncor_loparam = hmin_s(uncor_loparamv);
+		uncor_hiparam = hmax_s(uncor_hiparamv);
+
+		samec_loparam = hmin_s(samec_loparamv);
+		samec_hiparam = hmax_s(samec_hiparamv);
+
+		float uncor_linelen = uncor_hiparam - uncor_loparam;
+		float samec_linelen = samec_hiparam - samec_loparam;
+
+		// Turn very small numbers and NaNs into a small number
+		uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
+		samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_errorsumv);
+	samec_error = hadd_s(samec_errorsumv);
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	unsigned int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	vfloatacc uncor_errorsumv = vfloatacc::zero();
+	vfloatacc samec_errorsumv = vfloatacc::zero();
+
+	for (unsigned int partition = 0; partition < partition_count; partition++)
+	{
+		partition_lines3& pl = plines[partition];
+		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
+		unsigned int texel_count = pi.partition_texel_count[partition];
+		promise(texel_count > 0);
+
+		float uncor_loparam = 1e10f;
+		float uncor_hiparam = -1e10f;
+
+		float samec_loparam = 1e10f;
+		float samec_hiparam = -1e10f;
+
+		processed_line3 l_uncor = pl.uncor_pline;
+		processed_line3 l_samec = pl.samec_pline;
+
+		// This implementation is an example vectorization of this function.
+		// It works for - the codec is a 2-4% faster than not vectorizing - but
+		// the benefit is limited by the use of gathers and register pressure
+
+		// Vectorize some useful scalar inputs
+		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
+		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
+		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
+
+		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
+		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
+		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
+
+		vfloat l_samec_bs0(l_samec.bs.lane<0>());
+		vfloat l_samec_bs1(l_samec.bs.lane<1>());
+		vfloat l_samec_bs2(l_samec.bs.lane<2>());
+
+		assert(all(l_samec.amod == vfloat4(0.0f)));
+
+		vfloat uncor_loparamv(1e10f);
+		vfloat uncor_hiparamv(-1e10f);
+
+		vfloat samec_loparamv(1e10f);
+		vfloat samec_hiparamv(-1e10f);
+
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+
+		// This implementation over-shoots, but this is safe as we initialize the weights array
+		// to extend the last value. This means min/max are not impacted, but we need to mask
+		// out the dummy values when we compute the line weighting.
+		vint lane_ids = vint::lane_id();
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vmask mask = lane_ids < vint(texel_count);
+			vint texel_idxs(texel_indexes + i);
+
+			vfloat data_r = gatherf(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+
+			vfloat uncor_param = (data_r * l_uncor_bs0)
+			                   + (data_g * l_uncor_bs1)
+			                   + (data_b * l_uncor_bs2);
+
+			uncor_loparamv = min(uncor_param, uncor_loparamv);
+			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
+
+			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+			                   + (uncor_param * l_uncor_bs0);
+			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+			                   + (uncor_param * l_uncor_bs1);
+			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+			                   + (uncor_param * l_uncor_bs2);
+
+			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+			                 + (ew_g * uncor_dist1 * uncor_dist1)
+			                 + (ew_b * uncor_dist2 * uncor_dist2);
+
+			haccumulate(uncor_errorsumv, uncor_err, mask);
+
+			// Process samechroma data
+			vfloat samec_param = (data_r * l_samec_bs0)
+			                   + (data_g * l_samec_bs1)
+			                   + (data_b * l_samec_bs2);
+
+			samec_loparamv = min(samec_param, samec_loparamv);
+			samec_hiparamv = max(samec_param, samec_hiparamv);
+
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
+
+			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+			                 + (ew_g * samec_dist1 * samec_dist1)
+			                 + (ew_b * samec_dist2 * samec_dist2);
+
+			haccumulate(samec_errorsumv, samec_err, mask);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		uncor_loparam = hmin_s(uncor_loparamv);
+		uncor_hiparam = hmax_s(uncor_hiparamv);
+
+		samec_loparam = hmin_s(samec_loparamv);
+		samec_hiparam = hmax_s(samec_hiparamv);
+
+		float uncor_linelen = uncor_hiparam - uncor_loparam;
+		float samec_linelen = samec_hiparam - samec_loparam;
+
+		// Turn very small numbers and NaNs into a small number
+		pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
+		pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_errorsumv);
+	samec_error = hadd_s(samec_errorsumv);
+}
+
+#endif

+ 1184 - 0
thirdparty/astcenc/astcenc_block_sizes.cpp

@@ -0,0 +1,1184 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions to generate block size descriptor and decimation tables.
+ */
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Decode the properties of an encoded 2D block mode.
+ *
+ * @param      block_mode      The encoded block mode.
+ * @param[out] x_weights       The number of weights in the X dimension.
+ * @param[out] y_weights       The number of weights in the Y dimension.
+ * @param[out] is_dual_plane   True if this block mode has two weight planes.
+ * @param[out] quant_mode      The quantization level for the weights.
+ * @param[out] weight_bits     The storage bit count for the weights.
+ *
+ * @return Returns true if a valid mode, false otherwise.
+ */
+static bool decode_block_mode_2d(
+	unsigned int block_mode,
+	unsigned int& x_weights,
+	unsigned int& y_weights,
+	bool& is_dual_plane,
+	unsigned int& quant_mode,
+	unsigned int& weight_bits
+) {
+	// D enables a second weight plane; H offsets the weight quant mode by 6 levels
+	unsigned int base_quant_mode = (block_mode >> 4) & 1;
+	unsigned int H = (block_mode >> 9) & 1;
+	unsigned int D = (block_mode >> 10) & 1;
+	unsigned int A = (block_mode >> 5) & 0x3;
+
+	x_weights = 0;
+	y_weights = 0;
+
+	// The two low bits select between the two families of weight grid layouts
+	if ((block_mode & 3) != 0)
+	{
+		base_quant_mode |= (block_mode & 3) << 1;
+		unsigned int B = (block_mode >> 7) & 3;
+		switch ((block_mode >> 2) & 3)
+		{
+		case 0:
+			x_weights = B + 4;
+			y_weights = A + 2;
+			break;
+		case 1:
+			x_weights = B + 8;
+			y_weights = A + 2;
+			break;
+		case 2:
+			x_weights = A + 2;
+			y_weights = B + 8;
+			break;
+		case 3:
+			B &= 1;
+			if (block_mode & 0x100)
+			{
+				x_weights = B + 2;
+				y_weights = A + 2;
+			}
+			else
+			{
+				x_weights = A + 2;
+				y_weights = B + 6;
+			}
+			break;
+		}
+	}
+	else
+	{
+		base_quant_mode |= ((block_mode >> 2) & 3) << 1;
+		// An all-zero quant field is a reserved (invalid) encoding
+		if (((block_mode >> 2) & 3) == 0)
+		{
+			return false;
+		}
+
+		unsigned int B = (block_mode >> 9) & 3;
+		switch ((block_mode >> 7) & 3)
+		{
+		case 0:
+			x_weights = 12;
+			y_weights = A + 2;
+			break;
+		case 1:
+			x_weights = A + 2;
+			y_weights = 12;
+			break;
+		case 2:
+			// This layout group does not carry the D and H fields
+			x_weights = A + 6;
+			y_weights = B + 6;
+			D = 0;
+			H = 0;
+			break;
+		case 3:
+			switch ((block_mode >> 5) & 3)
+			{
+			case 0:
+				x_weights = 6;
+				y_weights = 10;
+				break;
+			case 1:
+				x_weights = 10;
+				y_weights = 6;
+				break;
+			case 2:
+			case 3:
+				return false;
+			}
+			break;
+		}
+	}
+
+	// Dual plane modes store two weights per grid sample
+	unsigned int weight_count = x_weights * y_weights * (D + 1);
+	quant_mode = (base_quant_mode - 2) + 6 * H;
+	is_dual_plane = D != 0;
+
+	// A mode is only valid if its weight storage fits the legal ISE bit range
+	weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
+	return (weight_count <= BLOCK_MAX_WEIGHTS &&
+	        weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
+	        weight_bits <= BLOCK_MAX_WEIGHT_BITS);
+}
+
+/**
+ * @brief Decode the properties of an encoded 3D block mode.
+ *
+ * @param      block_mode      The encoded block mode.
+ * @param[out] x_weights       The number of weights in the X dimension.
+ * @param[out] y_weights       The number of weights in the Y dimension.
+ * @param[out] z_weights       The number of weights in the Z dimension.
+ * @param[out] is_dual_plane   True if this block mode has two weight planes.
+ * @param[out] quant_mode      The quantization level for the weights.
+ * @param[out] weight_bits     The storage bit count for the weights.
+ *
+ * @return Returns true if a valid mode, false otherwise.
+ */
+static bool decode_block_mode_3d(
+	unsigned int block_mode,
+	unsigned int& x_weights,
+	unsigned int& y_weights,
+	unsigned int& z_weights,
+	bool& is_dual_plane,
+	unsigned int& quant_mode,
+	unsigned int& weight_bits
+) {
+	// D enables a second weight plane; H offsets the weight quant mode by 6 levels
+	unsigned int base_quant_mode = (block_mode >> 4) & 1;
+	unsigned int H = (block_mode >> 9) & 1;
+	unsigned int D = (block_mode >> 10) & 1;
+	unsigned int A = (block_mode >> 5) & 0x3;
+
+	x_weights = 0;
+	y_weights = 0;
+	z_weights = 0;
+
+	// The two low bits select between the two families of weight grid layouts
+	if ((block_mode & 3) != 0)
+	{
+		base_quant_mode |= (block_mode & 3) << 1;
+		unsigned int B = (block_mode >> 7) & 3;
+		unsigned int C = (block_mode >> 2) & 0x3;
+		x_weights = A + 2;
+		y_weights = B + 2;
+		z_weights = C + 2;
+	}
+	else
+	{
+		base_quant_mode |= ((block_mode >> 2) & 3) << 1;
+		// An all-zero quant field is a reserved (invalid) encoding
+		if (((block_mode >> 2) & 3) == 0)
+		{
+			return false;
+		}
+
+		int B = (block_mode >> 9) & 3;
+		// Only the final layout group retains the D and H fields
+		if (((block_mode >> 7) & 3) != 3)
+		{
+			D = 0;
+			H = 0;
+		}
+		switch ((block_mode >> 7) & 3)
+		{
+		case 0:
+			x_weights = 6;
+			y_weights = B + 2;
+			z_weights = A + 2;
+			break;
+		case 1:
+			x_weights = A + 2;
+			y_weights = 6;
+			z_weights = B + 2;
+			break;
+		case 2:
+			x_weights = A + 2;
+			y_weights = B + 2;
+			z_weights = 6;
+			break;
+		case 3:
+			x_weights = 2;
+			y_weights = 2;
+			z_weights = 2;
+			switch ((block_mode >> 5) & 3)
+			{
+			case 0:
+				x_weights = 6;
+				break;
+			case 1:
+				y_weights = 6;
+				break;
+			case 2:
+				z_weights = 6;
+				break;
+			case 3:
+				return false;
+			}
+			break;
+		}
+	}
+
+	// Dual plane modes store two weights per grid sample
+	unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1);
+	quant_mode = (base_quant_mode - 2) + 6 * H;
+	is_dual_plane = D != 0;
+
+	// A mode is only valid if its weight storage fits the legal ISE bit range
+	weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
+	return (weight_count <= BLOCK_MAX_WEIGHTS &&
+	        weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
+	        weight_bits <= BLOCK_MAX_WEIGHT_BITS);
+}
+
+/**
+ * @brief Create a 2D decimation entry for a block-size and weight-decimation pair.
+ *
+ * @param      x_texels    The number of texels in the X dimension.
+ * @param      y_texels    The number of texels in the Y dimension.
+ * @param      x_weights   The number of weights in the X dimension.
+ * @param      y_weights   The number of weights in the Y dimension.
+ * @param[out] di          The decimation info structure to populate.
+ * @param[out] wb          The decimation table init scratch working buffers.
+ */
+static void init_decimation_info_2d(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int x_weights,
+	unsigned int y_weights,
+	decimation_info& di,
+	dt_init_working_buffers& wb
+) {
+	unsigned int texels_per_block = x_texels * y_texels;
+	unsigned int weights_per_block = x_weights * y_weights;
+
+	uint8_t max_texel_count_of_weight = 0;
+
+	promise(weights_per_block > 0);
+	promise(texels_per_block > 0);
+	promise(x_texels > 0);
+	promise(y_texels > 0);
+
+	// Reset the scratch counters before accumulating the mappings
+	for (unsigned int i = 0; i < weights_per_block; i++)
+	{
+		wb.texel_count_of_weight[i] = 0;
+	}
+
+	for (unsigned int i = 0; i < texels_per_block; i++)
+	{
+		wb.weight_count_of_texel[i] = 0;
+	}
+
+	// Build the bidirectional texel <-> weight grid mapping
+	for (unsigned int y = 0; y < y_texels; y++)
+	{
+		for (unsigned int x = 0; x < x_texels; x++)
+		{
+			unsigned int texel = y * x_texels + x;
+
+			// Map texel position onto the weight grid in fixed point,
+			// with a 4-bit fractional part
+			unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
+			unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
+
+			unsigned int x_weight_frac = x_weight & 0xF;
+			unsigned int y_weight_frac = y_weight & 0xF;
+			unsigned int x_weight_int = x_weight >> 4;
+			unsigned int y_weight_int = y_weight >> 4;
+
+			// Indices of the four weight grid samples surrounding this texel
+			unsigned int qweight[4];
+			qweight[0] = x_weight_int + y_weight_int * x_weights;
+			qweight[1] = qweight[0] + 1;
+			qweight[2] = qweight[0] + x_weights;
+			qweight[3] = qweight[2] + 1;
+
+			// Truncated-precision bilinear interpolation
+			unsigned int prod = x_weight_frac * y_weight_frac;
+
+			unsigned int weight[4];
+			weight[3] = (prod + 8) >> 4;
+			weight[1] = x_weight_frac - weight[3];
+			weight[2] = y_weight_frac - weight[3];
+			weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];
+
+			// Record only non-zero contributions, in both directions
+			for (unsigned int i = 0; i < 4; i++)
+			{
+				if (weight[i] != 0)
+				{
+					wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
+					wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
+					wb.weight_count_of_texel[texel]++;
+					wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
+					wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
+					wb.texel_count_of_weight[qweight[i]]++;
+					max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
+				}
+			}
+		}
+	}
+
+	// Copy the scratch data into the transposed per-texel tables
+	uint8_t max_texel_weight_count = 0;
+	for (unsigned int i = 0; i < texels_per_block; i++)
+	{
+		di.texel_weight_count[i] = wb.weight_count_of_texel[i];
+		max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
+
+		for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
+		{
+			di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
+			di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
+			di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
+		}
+
+		// Init all 4 entries so we can rely on zeros for vectorization
+		for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
+		{
+			di.texel_weight_contribs_int_tr[j][i] = 0;
+			di.texel_weight_contribs_float_tr[j][i] = 0.0f;
+			di.texel_weights_tr[j][i] = 0;
+		}
+	}
+
+	di.max_texel_weight_count = max_texel_weight_count;
+
+	// Copy the scratch data into the transposed per-weight tables
+	for (unsigned int i = 0; i < weights_per_block; i++)
+	{
+		unsigned int texel_count_wt = wb.texel_count_of_weight[i];
+		di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
+
+		for (unsigned int j = 0; j < texel_count_wt; j++)
+		{
+			uint8_t texel = wb.texels_of_weight[i][j];
+
+			// Create transposed versions of these for better vectorization
+			di.weight_texels_tr[j][i] = texel;
+			di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
+
+			// Store the per-texel contribution of this weight for each texel it contributes to
+			di.texel_contrib_for_weight[j][i] = 0.0f;
+			for (unsigned int k = 0; k < 4; k++)
+			{
+				uint8_t dttw = di.texel_weights_tr[k][texel];
+				float dttwf = di.texel_weight_contribs_float_tr[k][texel];
+				if (dttw == i && dttwf != 0.0f)
+				{
+					di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
+					break;
+				}
+			}
+		}
+
+		// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+		// Match last texel in active lane in SIMD group, for better gathers
+		uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
+		for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
+		{
+			di.weight_texels_tr[j][i] = last_texel;
+			di.weights_texel_contribs_tr[j][i] = 0.0f;
+		}
+	}
+
+	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	{
+		di.texel_weight_count[i] = 0;
+
+		for (unsigned int j = 0; j < 4; j++)
+		{
+			di.texel_weight_contribs_float_tr[j][i] = 0;
+			di.texel_weights_tr[j][i] = 0;
+			di.texel_weight_contribs_int_tr[j][i] = 0;
+		}
+	}
+
+	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+	// Match last texel in active lane in SIMD group, for better gathers
+	unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
+	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
+
+	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	{
+		di.weight_texel_count[i] = 0;
+
+		for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
+		{
+			di.weight_texels_tr[j][i] = last_texel;
+			di.weights_texel_contribs_tr[j][i] = 0.0f;
+		}
+	}
+
+	di.texel_count = static_cast<uint8_t>(texels_per_block);
+	di.weight_count = static_cast<uint8_t>(weights_per_block);
+	di.weight_x = static_cast<uint8_t>(x_weights);
+	di.weight_y = static_cast<uint8_t>(y_weights);
+	di.weight_z = 1;
+}
+
+/**
+ * @brief Create a 3D decimation entry for a block-size and weight-decimation pair.
+ *
+ * @param      x_texels    The number of texels in the X dimension.
+ * @param      y_texels    The number of texels in the Y dimension.
+ * @param      z_texels    The number of texels in the Z dimension.
+ * @param      x_weights   The number of weights in the X dimension.
+ * @param      y_weights   The number of weights in the Y dimension.
+ * @param      z_weights   The number of weights in the Z dimension.
+ * @param[out] di          The decimation info structure to populate.
+ * @param[out] wb          The decimation table init scratch working buffers.
+ */
+static void init_decimation_info_3d(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int z_texels,
+	unsigned int x_weights,
+	unsigned int y_weights,
+	unsigned int z_weights,
+	decimation_info& di,
+	dt_init_working_buffers& wb
+) {
+	unsigned int texels_per_block = x_texels * y_texels * z_texels;
+	unsigned int weights_per_block = x_weights * y_weights * z_weights;
+
+	uint8_t max_texel_count_of_weight = 0;
+
+	promise(weights_per_block > 0);
+	promise(texels_per_block > 0);
+
+	// Reset the scratch counters before accumulating the mappings
+	for (unsigned int i = 0; i < weights_per_block; i++)
+	{
+		wb.texel_count_of_weight[i] = 0;
+	}
+
+	for (unsigned int i = 0; i < texels_per_block; i++)
+	{
+		wb.weight_count_of_texel[i] = 0;
+	}
+
+	// Build the bidirectional texel <-> weight grid mapping
+	for (unsigned int z = 0; z < z_texels; z++)
+	{
+		for (unsigned int y = 0; y < y_texels; y++)
+		{
+			for (unsigned int x = 0; x < x_texels; x++)
+			{
+				int texel = (z * y_texels + y) * x_texels + x;
+
+				// Map texel position onto the weight grid in fixed point,
+				// with a 4-bit fractional part
+				int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
+				int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
+				int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6;
+
+				int x_weight_frac = x_weight & 0xF;
+				int y_weight_frac = y_weight & 0xF;
+				int z_weight_frac = z_weight & 0xF;
+				int x_weight_int = x_weight >> 4;
+				int y_weight_int = y_weight >> 4;
+				int z_weight_int = z_weight >> 4;
+				int qweight[4];
+				int weight[4];
+				qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
+				qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1);
+
+				// simplex interpolation
+				int fs = x_weight_frac;
+				int ft = y_weight_frac;
+				int fp = z_weight_frac;
+
+				// Encode the relative ordering of the three fractional coordinates
+				int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp));
+				int N = x_weights;
+				int NM = x_weights * y_weights;
+
+				int s1, s2, w0, w1, w2, w3;
+				switch (cas)
+				{
+				case 7:
+					s1 = 1;
+					s2 = N;
+					w0 = 16 - fs;
+					w1 = fs - ft;
+					w2 = ft - fp;
+					w3 = fp;
+					break;
+				case 3:
+					s1 = N;
+					s2 = 1;
+					w0 = 16 - ft;
+					w1 = ft - fs;
+					w2 = fs - fp;
+					w3 = fp;
+					break;
+				case 5:
+					s1 = 1;
+					s2 = NM;
+					w0 = 16 - fs;
+					w1 = fs - fp;
+					w2 = fp - ft;
+					w3 = ft;
+					break;
+				case 4:
+					s1 = NM;
+					s2 = 1;
+					w0 = 16 - fp;
+					w1 = fp - fs;
+					w2 = fs - ft;
+					w3 = ft;
+					break;
+				case 2:
+					s1 = N;
+					s2 = NM;
+					w0 = 16 - ft;
+					w1 = ft - fp;
+					w2 = fp - fs;
+					w3 = fs;
+					break;
+				case 0:
+					s1 = NM;
+					s2 = N;
+					w0 = 16 - fp;
+					w1 = fp - ft;
+					w2 = ft - fs;
+					w3 = fs;
+					break;
+				default:
+					// Remaining encodings (1 and 6) cannot arise from a
+					// consistent ordering; mirror case 0
+					s1 = NM;
+					s2 = N;
+					w0 = 16 - fp;
+					w1 = fp - ft;
+					w2 = ft - fs;
+					w3 = fs;
+					break;
+				}
+
+				qweight[1] = qweight[0] + s1;
+				qweight[2] = qweight[1] + s2;
+				weight[0] = w0;
+				weight[1] = w1;
+				weight[2] = w2;
+				weight[3] = w3;
+
+				// Record only non-zero contributions, in both directions
+				for (unsigned int i = 0; i < 4; i++)
+				{
+					if (weight[i] != 0)
+					{
+						wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
+						wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
+						wb.weight_count_of_texel[texel]++;
+						wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
+						wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
+						wb.texel_count_of_weight[qweight[i]]++;
+						max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
+					}
+				}
+			}
+		}
+	}
+
+	// Copy the scratch data into the transposed per-texel tables
+	uint8_t max_texel_weight_count = 0;
+	for (unsigned int i = 0; i < texels_per_block; i++)
+	{
+		di.texel_weight_count[i] = wb.weight_count_of_texel[i];
+		max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
+
+		// Init all 4 entries so we can rely on zeros for vectorization
+		for (unsigned int j = 0; j < 4; j++)
+		{
+			di.texel_weight_contribs_int_tr[j][i] = 0;
+			di.texel_weight_contribs_float_tr[j][i] = 0.0f;
+			di.texel_weights_tr[j][i] = 0;
+		}
+
+		for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
+		{
+			di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
+			di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
+			di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
+		}
+	}
+
+	di.max_texel_weight_count = max_texel_weight_count;
+
+	// Copy the scratch data into the transposed per-weight tables
+	for (unsigned int i = 0; i < weights_per_block; i++)
+	{
+		unsigned int texel_count_wt = wb.texel_count_of_weight[i];
+		di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
+
+		for (unsigned int j = 0; j < texel_count_wt; j++)
+		{
+			unsigned int texel = wb.texels_of_weight[i][j];
+
+			// Create transposed versions of these for better vectorization
+			di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel);
+			di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
+
+			// Store the per-texel contribution of this weight for each texel it contributes to
+			di.texel_contrib_for_weight[j][i] = 0.0f;
+			for (unsigned int k = 0; k < 4; k++)
+			{
+				uint8_t dttw = di.texel_weights_tr[k][texel];
+				float dttwf = di.texel_weight_contribs_float_tr[k][texel];
+				if (dttw == i && dttwf != 0.0f)
+				{
+					di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
+					break;
+				}
+			}
+		}
+
+		// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+		// Match last texel in active lane in SIMD group, for better gathers
+		uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
+		for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
+		{
+			di.weight_texels_tr[j][i] = last_texel;
+			di.weights_texel_contribs_tr[j][i] = 0.0f;
+		}
+	}
+
+	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	{
+		di.texel_weight_count[i] = 0;
+
+		for (unsigned int j = 0; j < 4; j++)
+		{
+			di.texel_weight_contribs_float_tr[j][i] = 0;
+			di.texel_weights_tr[j][i] = 0;
+			di.texel_weight_contribs_int_tr[j][i] = 0;
+		}
+	}
+
+	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
+	// Match last texel in active lane in SIMD group, for better gathers
+	int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
+	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
+
+	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	{
+		di.weight_texel_count[i] = 0;
+
+		for (int j = 0; j < max_texel_count_of_weight; j++)
+		{
+			di.weight_texels_tr[j][i] = last_texel;
+			di.weights_texel_contribs_tr[j][i] = 0.0f;
+		}
+	}
+
+	di.texel_count = static_cast<uint8_t>(texels_per_block);
+	di.weight_count = static_cast<uint8_t>(weights_per_block);
+	di.weight_x = static_cast<uint8_t>(x_weights);
+	di.weight_y = static_cast<uint8_t>(y_weights);
+	di.weight_z = static_cast<uint8_t>(z_weights);
+}
+
+/**
+ * @brief Select the texel subset used for kmeans partition clustering.
+ *
+ * Blocks with at most @c BLOCK_MAX_KMEANS_TEXELS texels use every texel;
+ * larger blocks use a random sample of @c BLOCK_MAX_KMEANS_TEXELS texels.
+ * The @c bsd.texel_count field is an input and must be populated beforehand.
+ *
+ * @param[in,out] bsd   The block size descriptor to populate.
+ */
+static void assign_kmeans_texels(
+	block_size_descriptor& bsd
+) {
+	// Small blocks simply use every texel
+	if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS)
+	{
+		for (uint8_t tix = 0; tix < bsd.texel_count; tix++)
+		{
+			bsd.kmeans_texels[tix] = tix;
+		}
+
+		return;
+	}
+
+	// Large blocks draw a random subset, rejecting repeats
+	uint64_t prng_state[2];
+	astc::rand_init(prng_state);
+
+	// Track which texel indices have already been drawn
+	bool used[BLOCK_MAX_TEXELS];
+	for (uint8_t tix = 0; tix < bsd.texel_count; tix++)
+	{
+		used[tix] = false;
+	}
+
+	// Keep drawing until BLOCK_MAX_KMEANS_TEXELS unique indices are stored
+	unsigned int stored = 0;
+	while (stored < BLOCK_MAX_KMEANS_TEXELS)
+	{
+		uint8_t candidate = static_cast<uint8_t>(astc::rand(prng_state));
+		candidate = candidate % bsd.texel_count;
+		if (!used[candidate])
+		{
+			bsd.kmeans_texels[stored++] = candidate;
+			used[candidate] = true;
+		}
+	}
+}
+
+/**
+ * @brief Allocate a single 2D decimation table entry.
+ *
+ * @param x_texels    The number of texels in the X dimension.
+ * @param y_texels    The number of texels in the Y dimension.
+ * @param x_weights   The number of weights in the X dimension.
+ * @param y_weights   The number of weights in the Y dimension.
+ * @param bsd         The block size descriptor we are populating.
+ * @param wb          The decimation table init scratch working buffers.
+ * @param index       The packed array index to populate.
+ */
+static void construct_dt_entry_2d(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int x_weights,
+	unsigned int y_weights,
+	block_size_descriptor& bsd,
+	dt_init_working_buffers& wb,
+	unsigned int index
+) {
+	unsigned int weights = x_weights * y_weights;
+	assert(weights <= BLOCK_MAX_WEIGHTS);
+
+	// Dual plane modes double the stored weight count
+	bool check_2planes = (2 * weights) <= BLOCK_MAX_WEIGHTS;
+
+	decimation_info& di = bsd.decimation_tables[index];
+	init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb);
+
+	// Find the highest weight quant level whose storage cost stays in range
+	int best_prec_1plane = -1;
+	int best_prec_2planes = -1;
+	for (int quant = 0; quant < 12; quant++)
+	{
+		quant_method qm = static_cast<quant_method>(quant);
+
+		unsigned int bits_sp = get_ise_sequence_bitcount(weights, qm);
+		if (bits_sp >= BLOCK_MIN_WEIGHT_BITS && bits_sp <= BLOCK_MAX_WEIGHT_BITS)
+		{
+			best_prec_1plane = quant;
+		}
+
+		if (check_2planes)
+		{
+			unsigned int bits_dp = get_ise_sequence_bitcount(2 * weights, qm);
+			if (bits_dp >= BLOCK_MIN_WEIGHT_BITS && bits_dp <= BLOCK_MAX_WEIGHT_BITS)
+			{
+				best_prec_2planes = quant;
+			}
+		}
+	}
+
+	// At least one of the two should be valid ...
+	assert(best_prec_1plane >= 0 || best_prec_2planes >= 0);
+	bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(best_prec_1plane);
+	bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(best_prec_2planes);
+	bsd.decimation_modes[index].refprec_1_plane = 0;
+	bsd.decimation_modes[index].refprec_2_planes = 0;
+}
+
+/**
+ * @brief Allocate block modes and decimation tables for a single 2D block size.
+ *
+ * @param      x_texels         The number of texels in the X dimension.
+ * @param      y_texels         The number of texels in the Y dimension.
+ * @param      can_omit_modes   Can we discard modes that astcenc won't use, even if legal?
+ * @param      mode_cutoff      Percentile cutoff in range [0,1]. Low values more likely to be used.
+ * @param[out] bsd              The block size descriptor to populate.
+ */
+static void construct_block_size_descriptor_2d(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	bool can_omit_modes,
+	float mode_cutoff,
+	block_size_descriptor& bsd
+) {
+	// Store a remap table for storing packed decimation modes.
+	// Indexing uses [Y * 16 + X] and max size for each axis is 12.
+	static const unsigned int MAX_DMI = 12 * 16 + 12;
+	int decimation_mode_index[MAX_DMI];
+
+	dt_init_working_buffers* wb = new dt_init_working_buffers;
+
+	bsd.xdim = static_cast<uint8_t>(x_texels);
+	bsd.ydim = static_cast<uint8_t>(y_texels);
+	bsd.zdim = 1;
+	bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels);
+
+	// Mark every decimation grid as not-yet-allocated
+	for (unsigned int i = 0; i < MAX_DMI; i++)
+	{
+		decimation_mode_index[i] = -1;
+	}
+
+	// Gather all the decimation grids that can be used with the current block
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	const float *percentiles = get_2d_percentile_table(x_texels, y_texels);
+	// Only modes with a zero percentile rank qualify for the "always" pass
+	float always_cutoff = 0.0f;
+#else
+	// Unused in decompress-only builds
+	(void)can_omit_modes;
+	(void)mode_cutoff;
+#endif
+
+	// Construct the list of block formats referencing the decimation tables
+	unsigned int packed_bm_idx = 0;
+	unsigned int packed_dm_idx = 0;
+
+	// Trackers
+	unsigned int bm_counts[4] { 0 };
+	unsigned int dm_counts[4] { 0 };
+
+	// Clear the list to a known-bad value
+	for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
+	{
+		bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
+	}
+
+	// Iterate four times to build a usefully ordered list:
+	//   - Pass 0 - keep selected single plane "always" block modes
+	//   - Pass 1 - keep selected single plane "non-always" block modes
+	//   - Pass 2 - keep select dual plane block modes
+	//   - Pass 3 - keep everything else that's legal
+	unsigned int limit = can_omit_modes ? 3 : 4;
+	for (unsigned int j = 0; j < limit; j ++)
+	{
+		for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
+		{
+			// Skip modes we've already included in a previous pass
+			if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
+			{
+				continue;
+			}
+
+			// Decode parameters
+			unsigned int x_weights;
+			unsigned int y_weights;
+			bool is_dual_plane;
+			unsigned int quant_mode;
+			unsigned int weight_bits;
+			bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits);
+
+			// Always skip invalid encodings for the current block size
+			if (!valid || (x_weights > x_texels) || (y_weights > y_texels))
+			{
+				continue;
+			}
+
+			// Selectively skip dual plane encodings
+			if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane))
+			{
+				continue;
+			}
+
+			// Always skip encodings we can't physically encode based on
+			// generic encoding bit availability
+			if (is_dual_plane)
+			{
+				 // This is the only check we need as only support 1 partition
+				 if ((109 - weight_bits) <= 0)
+				 {
+					continue;
+				 }
+			}
+			else
+			{
+				// This is conservative - fewer bits may be available for > 1 partition
+				 if ((111 - weight_bits) <= 0)
+				 {
+					continue;
+				 }
+			}
+
+			// Selectively skip encodings based on percentile
+			bool percentile_hit = false;
+	#if !defined(ASTCENC_DECOMPRESS_ONLY)
+			if (j == 0)
+			{
+				percentile_hit = percentiles[i] <= always_cutoff;
+			}
+			else
+			{
+				percentile_hit = percentiles[i] <= mode_cutoff;
+			}
+	#endif
+
+			// Pass 3 keeps everything legal; earlier passes filter by percentile
+			if (j != 3 && !percentile_hit)
+			{
+				continue;
+			}
+
+			// Allocate and initialize the decimation table entry if we've not used it yet
+			int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
+			if (decimation_mode < 0)
+			{
+				construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx);
+				decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx;
+				decimation_mode = packed_dm_idx;
+
+				dm_counts[j]++;
+				packed_dm_idx++;
+			}
+
+			auto& bm = bsd.block_modes[packed_bm_idx];
+
+			bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
+			bm.quant_mode = static_cast<uint8_t>(quant_mode);
+			bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane);
+			bm.weight_bits = static_cast<uint8_t>(weight_bits);
+			bm.mode_index = static_cast<uint16_t>(i);
+
+			auto& dm = bsd.decimation_modes[decimation_mode];
+
+			// Record which plane counts reference this decimation mode
+			if (is_dual_plane)
+			{
+				dm.set_ref_2_plane(bm.get_weight_quant_mode());
+			}
+			else
+			{
+				dm.set_ref_1_plane(bm.get_weight_quant_mode());
+			}
+
+			bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
+
+			packed_bm_idx++;
+			bm_counts[j]++;
+		}
+	}
+
+	// Accumulate cumulative counts for each selection tier
+	bsd.block_mode_count_1plane_always = bm_counts[0];
+	bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1];
+	bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2];
+	bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3];
+
+	bsd.decimation_mode_count_always = dm_counts[0];
+	bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2];
+	bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3];
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	assert(bsd.block_mode_count_1plane_always > 0);
+	assert(bsd.decimation_mode_count_always > 0);
+
+	delete[] percentiles;
+#endif
+
+	// Ensure the end of the array contains valid data (should never get read)
+	for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
+	{
+		bsd.decimation_modes[i].maxprec_1plane = -1;
+		bsd.decimation_modes[i].maxprec_2planes = -1;
+		bsd.decimation_modes[i].refprec_1_plane = 0;
+		bsd.decimation_modes[i].refprec_2_planes = 0;
+	}
+
+	// Determine the texels to use for kmeans clustering.
+	assign_kmeans_texels(bsd);
+
+	delete wb;
+}
+
/**
 * @brief Allocate block modes and decimation tables for a single 3D block size.
 *
 * TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
 * the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
 *
 * @param      x_texels   The number of texels in the X dimension.
 * @param      y_texels   The number of texels in the Y dimension.
 * @param      z_texels   The number of texels in the Z dimension.
 * @param[out] bsd        The block size descriptor to populate.
 */
static void construct_block_size_descriptor_3d(
	unsigned int x_texels,
	unsigned int y_texels,
	unsigned int z_texels,
	block_size_descriptor& bsd
) {
	// Store a remap table for storing packed decimation modes.
	// Indexing uses [Z * 64 + Y *  8 + X] and max size for each axis is 6.
	static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6;
	int decimation_mode_index[MAX_DMI];
	unsigned int decimation_mode_count = 0;

	// Scratch buffers for init_decimation_info_3d(); heap allocated, freed at end
	dt_init_working_buffers* wb = new dt_init_working_buffers;

	bsd.xdim = static_cast<uint8_t>(x_texels);
	bsd.ydim = static_cast<uint8_t>(y_texels);
	bsd.zdim = static_cast<uint8_t>(z_texels);
	bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels);

	// -1 marks a weight grid size with no decimation table entry
	for (unsigned int i = 0; i < MAX_DMI; i++)
	{
		decimation_mode_index[i] = -1;
	}

	// gather all the infill-modes that can be used with the current block size
	for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++)
	{
		for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++)
		{
			for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++)
			{
				unsigned int weight_count = x_weights * y_weights * z_weights;
				if (weight_count > BLOCK_MAX_WEIGHTS)
				{
					continue;
				}

				decimation_info& di = bsd.decimation_tables[decimation_mode_count];
				decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
				init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb);

				// Find the highest of the 12 weight quant levels whose ISE-encoded
				// size fits within the per-block weight bit budget
				int maxprec_1plane = -1;
				int maxprec_2planes = -1;
				for (unsigned int i = 0; i < 12; i++)
				{
					unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
					if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
					{
						maxprec_1plane = i;
					}

					unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i));
					if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
					{
						maxprec_2planes = i;
					}
				}

				// Dual plane stores twice the weights, which may exceed block capacity
				if ((2 * weight_count) > BLOCK_MAX_WEIGHTS)
				{
					maxprec_2planes = -1;
				}

				bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
				bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
				bsd.decimation_modes[decimation_mode_count].refprec_1_plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
				bsd.decimation_modes[decimation_mode_count].refprec_2_planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
				decimation_mode_count++;
			}
		}
	}

	// Ensure the end of the array contains valid data (should never get read)
	for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
	{
		bsd.decimation_modes[i].maxprec_1plane = -1;
		bsd.decimation_modes[i].maxprec_2planes = -1;
		bsd.decimation_modes[i].refprec_1_plane = 0;
		bsd.decimation_modes[i].refprec_2_planes = 0;
	}

	bsd.decimation_mode_count_always = 0; // Skipped for 3D modes
	bsd.decimation_mode_count_selected = decimation_mode_count;
	bsd.decimation_mode_count_all = decimation_mode_count;

	// Construct the list of block formats referencing the decimation tables

	// Clear the list to a known-bad value
	for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
	{
		bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
	}

	unsigned int packed_idx = 0;
	unsigned int bm_counts[2] { 0 };

	// Iterate two times to build a usefully ordered list:
	//   - Pass 0 - keep valid single plane block modes
	//   - Pass 1 - keep valid dual plane block modes
	for (unsigned int j = 0; j < 2; j++)
	{
		for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
		{
			// Skip modes we've already included in a previous pass
			if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
			{
				continue;
			}

			unsigned int x_weights;
			unsigned int y_weights;
			unsigned int z_weights;
			bool is_dual_plane;
			unsigned int quant_mode;
			unsigned int weight_bits;

			bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits);
			// Skip invalid encodings
			if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels)
			{
				continue;
			}

			// Skip encodings in the wrong iteration
			if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane))
			{
				continue;
			}

			// Always skip encodings we can't physically encode based on bit availability
			if (is_dual_plane)
			{
				 // This is the only check we need as only support 1 partition
				 if ((109 - weight_bits) <= 0)
				 {
					continue;
				 }
			}
			else
			{
				// This is conservative - fewer bits may be available for > 1 partition
				 if ((111 - weight_bits) <= 0)
				 {
					continue;
				 }
			}

			int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights];
			bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode);
			bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode);
			bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits);
			bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane);
			bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i);

			bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx);
			bm_counts[j]++;
			packed_idx++;
		}
	}

	bsd.block_mode_count_1plane_always = 0;  // Skipped for 3D modes
	bsd.block_mode_count_1plane_selected = bm_counts[0];
	bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1];
	bsd.block_mode_count_all = bm_counts[0] + bm_counts[1];

	// Determine the texels to use for kmeans clustering.
	assign_kmeans_texels(bsd);

	delete wb;
}
+
+/* See header for documentation. */
+void init_block_size_descriptor(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int z_texels,
+	bool can_omit_modes,
+	unsigned int partition_count_cutoff,
+	float mode_cutoff,
+	block_size_descriptor& bsd
+) {
+	if (z_texels > 1)
+	{
+		construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd);
+	}
+	else
+	{
+		construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd);
+	}
+
+	init_partition_tables(bsd, can_omit_modes, partition_count_cutoff);
+}

+ 2071 - 0
thirdparty/astcenc/astcenc_color_quantize.cpp

@@ -0,0 +1,2071 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for color quantization.
+ *
+ * The design of the color quantization functionality requires the caller to use higher level error
+ * analysis to determine the base encoding that should be used. This earlier analysis will select
+ * the basic type of the endpoint that should be used:
+ *
+ *     * Mode: LDR or HDR
+ *     * Quantization level
+ *     * Channel count: L, LA, RGB, or RGBA
+ *     * Endpoint 2 type: Direct color endcode, or scaled from endpoint 1.
+ *
+ * However, this leaves a number of decisions about exactly how to pack the endpoints open. In
+ * particular we need to determine if blue contraction can be used, or/and if delta encoding can be
+ * used. If they can be applied these will allow us to maintain higher precision in the endpoints
+ * without needing additional storage.
+ */
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Determine the quantized value given a quantization level.
+ *
+ * @param quant_level   The quantization level to use.
+ * @param value         The value to convert. This may be outside of the 0-255 range and will be
+ *                      clamped before the value is looked up.
+ *
+ * @return The encoded quantized value. These are not necessarily in order; the compressor
+ *         scrambles the values slightly to make hardware implementation easier.
+ */
+static inline uint8_t quant_color(
+	quant_method quant_level,
+	int value
+) {
+	return color_unquant_to_uquant_tables[quant_level - QUANT_6][value];
+}
+
/**
 * @brief Quantize an LDR RGB color.
 *
 * Since this is a fall-back encoding, we cannot actually fail but must produce a sensible result.
 * For this encoding @c color0 cannot be larger than @c color1. If @c color0 is actually larger
 * than @c color1, @c color0 is reduced and @c color1 is increased until the constraint is met.
 *
 * @param      color0        The input unquantized color0 endpoint.
 * @param      color1        The input unquantized color1 endpoint.
 * @param[out] output        The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
 * @param      quant_level   The quantization level to use.
 */
static void quantize_rgb(
	vfloat4 color0,
	vfloat4 color1,
	uint8_t output[6],
	quant_method quant_level
) {
	// 257 = 65535 / 255; assumes endpoint components are in a 0-65535 range — confirm at call sites
	float scale = 1.0f / 257.0f;

	float r0 = astc::clamp255f(color0.lane<0>() * scale);
	float g0 = astc::clamp255f(color0.lane<1>() * scale);
	float b0 = astc::clamp255f(color0.lane<2>() * scale);

	float r1 = astc::clamp255f(color1.lane<0>() * scale);
	float g1 = astc::clamp255f(color1.lane<1>() * scale);
	float b1 = astc::clamp255f(color1.lane<2>() * scale);

	// Nudge color0 progressively down and color1 progressively up until the
	// quantized component sums satisfy the sum(color0) <= sum(color1) constraint
	int ri0, gi0, bi0, ri1, gi1, bi1;
	float rgb0_addon = 0.5f;
	float rgb1_addon = 0.5f;
	do
	{
		ri0 = quant_color(quant_level, astc::max(astc::flt2int_rd(r0 + rgb0_addon), 0));
		gi0 = quant_color(quant_level, astc::max(astc::flt2int_rd(g0 + rgb0_addon), 0));
		bi0 = quant_color(quant_level, astc::max(astc::flt2int_rd(b0 + rgb0_addon), 0));
		ri1 = quant_color(quant_level, astc::min(astc::flt2int_rd(r1 + rgb1_addon), 255));
		gi1 = quant_color(quant_level, astc::min(astc::flt2int_rd(g1 + rgb1_addon), 255));
		bi1 = quant_color(quant_level, astc::min(astc::flt2int_rd(b1 + rgb1_addon), 255));

		rgb0_addon -= 0.2f;
		rgb1_addon += 0.2f;
	} while (ri0 + gi0 + bi0 > ri1 + gi1 + bi1);

	output[0] = static_cast<uint8_t>(ri0);
	output[1] = static_cast<uint8_t>(ri1);
	output[2] = static_cast<uint8_t>(gi0);
	output[3] = static_cast<uint8_t>(gi1);
	output[4] = static_cast<uint8_t>(bi0);
	output[5] = static_cast<uint8_t>(bi1);
}
+
+/**
+ * @brief Quantize an LDR RGBA color.
+ *
+ * Since this is a fall-back encoding, we cannot actually fail but must produce a sensible result.
+ * For this encoding @c color0.rgb cannot be larger than @c color1.rgb (this indicates blue
+ * contraction). If @c color0.rgb is actually larger than @c color1.rgb, @c color0.rgb is reduced
+ * and @c color1.rgb is increased until the constraint is met.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1).
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_rgba(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	float scale = 1.0f / 257.0f;
+
+	float a0 = astc::clamp255f(color0.lane<3>() * scale);
+	float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+	output[6] = quant_color(quant_level, astc::flt2int_rtn(a0));
+	output[7] = quant_color(quant_level, astc::flt2int_rtn(a1));
+
+	quantize_rgb(color0, color1, output, quant_level);
+}
+
+/**
+ * @brief Try to quantize an LDR RGB color using blue-contraction.
+ *
+ * Blue-contraction is only usable if encoded color 1 is larger than color 0.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (r1, r0, g1, g0, b1, b0).
+ * @param      quant_level   The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgb_blue_contract(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[6],
+	quant_method quant_level
+) {
+	float scale = 1.0f / 257.0f;
+
+	float r0 = color0.lane<0>() * scale;
+	float g0 = color0.lane<1>() * scale;
+	float b0 = color0.lane<2>() * scale;
+
+	float r1 = color1.lane<0>() * scale;
+	float g1 = color1.lane<1>() * scale;
+	float b1 = color1.lane<2>() * scale;
+
+	// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
+	r0 += (r0 - b0);
+	g0 += (g0 - b0);
+	r1 += (r1 - b1);
+	g1 += (g1 - b1);
+
+	if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
+		r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
+	{
+		return false;
+	}
+
+	// Quantize the inverse-blue-contracted color
+	int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0));
+	int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0));
+	int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0));
+
+	int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1));
+	int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1));
+	int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1));
+
+	// If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that
+	// blue-contraction and quantization change this order, which is why we must test afterwards.
+	if (ri1 + gi1 + bi1 <= ri0 + gi0 + bi0)
+	{
+		return false;
+	}
+
+	output[0] = static_cast<uint8_t>(ri1);
+	output[1] = static_cast<uint8_t>(ri0);
+	output[2] = static_cast<uint8_t>(gi1);
+	output[3] = static_cast<uint8_t>(gi0);
+	output[4] = static_cast<uint8_t>(bi1);
+	output[5] = static_cast<uint8_t>(bi0);
+
+	return true;
+}
+
+/**
+ * @brief Try to quantize an LDR RGBA color using blue-contraction.
+ *
+ * Blue-contraction is only usable if encoded color 1 RGB is larger than color 0 RGB.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (r1, r0, g1, g0, b1, b0, a1, a0).
+ * @param      quant_level   The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static int try_quantize_rgba_blue_contract(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	float scale = 1.0f / 257.0f;
+
+	float a0 = astc::clamp255f(color0.lane<3>() * scale);
+	float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+	output[6] = quant_color(quant_level, astc::flt2int_rtn(a1));
+	output[7] = quant_color(quant_level, astc::flt2int_rtn(a0));
+
+	return try_quantize_rgb_blue_contract(color0, color1, output, quant_level);
+}
+
/**
 * @brief Try to quantize an LDR RGB color using delta encoding.
 *
 * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
 * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
 * non-negative, then we encode a regular delta.
 *
 * @param      color0        The input unquantized color0 endpoint.
 * @param      color1        The input unquantized color1 endpoint.
 * @param[out] output        The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
 * @param      quant_level   The quantization level to use.
 *
 * @return Returns @c false on failure, @c true on success.
 */
static bool try_quantize_rgb_delta(
	vfloat4 color0,
	vfloat4 color1,
	uint8_t output[6],
	quant_method quant_level
) {
	float scale = 1.0f / 257.0f;

	float r0 = astc::clamp255f(color0.lane<0>() * scale);
	float g0 = astc::clamp255f(color0.lane<1>() * scale);
	float b0 = astc::clamp255f(color0.lane<2>() * scale);

	float r1 = astc::clamp255f(color1.lane<0>() * scale);
	float g1 = astc::clamp255f(color1.lane<1>() * scale);
	float b1 = astc::clamp255f(color1.lane<2>() * scale);

	// Transform r0 to unorm9
	int r0a = astc::flt2int_rtn(r0);
	int g0a = astc::flt2int_rtn(g0);
	int b0a = astc::flt2int_rtn(b0);

	r0a <<= 1;
	g0a <<= 1;
	b0a <<= 1;

	// Mask off the top bit
	int r0b = r0a & 0xFF;
	int g0b = g0a & 0xFF;
	int b0b = b0a & 0xFF;

	// Quantize then unquantize in order to get a value that we take differences against
	int r0be = quant_color(quant_level, r0b);
	int g0be = quant_color(quant_level, g0b);
	int b0be = quant_color(quant_level, b0b);

	// Reattach the top bit so the base is a full unorm9 again
	r0b = r0be | (r0a & 0x100);
	g0b = g0be | (g0a & 0x100);
	b0b = b0be | (b0a & 0x100);

	// Get hold of the second value
	int r1d = astc::flt2int_rtn(r1);
	int g1d = astc::flt2int_rtn(g1);
	int b1d = astc::flt2int_rtn(b1);

	r1d <<= 1;
	g1d <<= 1;
	b1d <<= 1;

	// ... and take differences
	r1d -= r0b;
	g1d -= g0b;
	b1d -= b0b;

	// Check if the difference is too large to be encodable (7-bit signed range)
	if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
	{
		return false;
	}

	// Insert top bit of the base into the offset
	r1d &= 0x7F;
	g1d &= 0x7F;
	b1d &= 0x7F;

	r1d |= (r0b & 0x100) >> 1;
	g1d |= (g0b & 0x100) >> 1;
	b1d |= (b0b & 0x100) >> 1;

	// Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails
	// since we have then corrupted either the top bit of the base or the sign bit of the offset
	int r1de = quant_color(quant_level, r1d);
	int g1de = quant_color(quant_level, g1d);
	int b1de = quant_color(quant_level, b1d);

	if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
	{
		return false;
	}

	// If the sum of offsets triggers blue-contraction then encoding fails
	vint4 ep0(r0be, g0be, b0be, 0);
	vint4 ep1(r1de, g1de, b1de, 0);
	bit_transfer_signed(ep1, ep0);
	if (hadd_rgb_s(ep1) < 0)
	{
		return false;
	}

	// Check that the offsets produce legitimate sums as well
	ep0 = ep0 + ep1;
	if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
	{
		return false;
	}

	output[0] = static_cast<uint8_t>(r0be);
	output[1] = static_cast<uint8_t>(r1de);
	output[2] = static_cast<uint8_t>(g0be);
	output[3] = static_cast<uint8_t>(g1de);
	output[4] = static_cast<uint8_t>(b0be);
	output[5] = static_cast<uint8_t>(b1de);

	return true;
}
+
/**
 * @brief Try to quantize an LDR RGB color using delta encoding with blue-contraction.
 *
 * As @c try_quantize_rgb_delta, but the endpoints are swapped on entry and the
 * encoding is only accepted when the sum of the decoded offsets is negative,
 * i.e. when the decoder will apply blue-contraction.
 *
 * @param      color0        The input unquantized color0 endpoint.
 * @param      color1        The input unquantized color1 endpoint.
 * @param[out] output        The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
 * @param      quant_level   The quantization level to use.
 *
 * @return Returns @c false on failure, @c true on success.
 */
static bool try_quantize_rgb_delta_blue_contract(
	vfloat4 color0,
	vfloat4 color1,
	uint8_t output[6],
	quant_method quant_level
) {
	// Note: Switch around endpoint colors already at start
	float scale = 1.0f / 257.0f;

	float r1 = color0.lane<0>() * scale;
	float g1 = color0.lane<1>() * scale;
	float b1 = color0.lane<2>() * scale;

	float r0 = color1.lane<0>() * scale;
	float g0 = color1.lane<1>() * scale;
	float b0 = color1.lane<2>() * scale;

	// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
	r0 += (r0 - b0);
	g0 += (g0 - b0);
	r1 += (r1 - b1);
	g1 += (g1 - b1);

	if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
	    r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
	{
		return false;
	}

	// Transform r0 to unorm9
	int r0a = astc::flt2int_rtn(r0);
	int g0a = astc::flt2int_rtn(g0);
	int b0a = astc::flt2int_rtn(b0);
	r0a <<= 1;
	g0a <<= 1;
	b0a <<= 1;

	// Mask off the top bit
	int r0b = r0a & 0xFF;
	int g0b = g0a & 0xFF;
	int b0b = b0a & 0xFF;

	// Quantize, then unquantize in order to get a value that we take differences against.
	int r0be = quant_color(quant_level, r0b);
	int g0be = quant_color(quant_level, g0b);
	int b0be = quant_color(quant_level, b0b);

	r0b = r0be | (r0a & 0x100);
	g0b = g0be | (g0a & 0x100);
	b0b = b0be | (b0a & 0x100);

	// Get hold of the second value
	int r1d = astc::flt2int_rtn(r1);
	int g1d = astc::flt2int_rtn(g1);
	int b1d = astc::flt2int_rtn(b1);

	r1d <<= 1;
	g1d <<= 1;
	b1d <<= 1;

	// .. and take differences!
	r1d -= r0b;
	g1d -= g0b;
	b1d -= b0b;

	// Check if the difference is too large to be encodable (7-bit signed range)
	if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
	{
		return false;
	}

	// Insert top bit of the base into the offset
	r1d &= 0x7F;
	g1d &= 0x7F;
	b1d &= 0x7F;

	r1d |= (r0b & 0x100) >> 1;
	g1d |= (g0b & 0x100) >> 1;
	b1d |= (b0b & 0x100) >> 1;

	// Then quantize and unquantize; if this causes any of the top two bits to flip,
	// then encoding fails, since we have then corrupted either the top bit of the base
	// or the sign bit of the offset.
	int r1de = quant_color(quant_level, r1d);
	int g1de = quant_color(quant_level, g1d);
	int b1de = quant_color(quant_level, b1d);

	if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
	{
		return false;
	}

	// If the sum of offsets does not trigger blue-contraction then encoding fails
	vint4 ep0(r0be, g0be, b0be, 0);
	vint4 ep1(r1de, g1de, b1de, 0);
	bit_transfer_signed(ep1, ep0);
	if (hadd_rgb_s(ep1) >= 0)
	{
		return false;
	}

	// Check that the offsets produce legitimate sums as well
	ep0 = ep0 + ep1;
	if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
	{
		return false;
	}

	output[0] = static_cast<uint8_t>(r0be);
	output[1] = static_cast<uint8_t>(r1de);
	output[2] = static_cast<uint8_t>(g0be);
	output[3] = static_cast<uint8_t>(g1de);
	output[4] = static_cast<uint8_t>(b0be);
	output[5] = static_cast<uint8_t>(b1de);

	return true;
}
+
/**
 * @brief Try to quantize an LDR A color using delta encoding.
 *
 * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
 * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
 * non-negative, then we encode a regular delta.
 *
 * This function only compresses the alpha - the other elements in the output array are not touched.
 *
 * @param      color0        The input unquantized color0 endpoint.
 * @param      color1        The input unquantized color1 endpoint.
 * @param[out] output        The output endpoints, returned as (x, x, x, x, x, x, a0, a1).
 * @param      quant_level   The quantization level to use.
 *
 * @return Returns @c false on failure, @c true on success.
 */
static bool try_quantize_alpha_delta(
	vfloat4 color0,
	vfloat4 color1,
	uint8_t output[8],
	quant_method quant_level
) {
	float scale = 1.0f / 257.0f;

	float a0 = astc::clamp255f(color0.lane<3>() * scale);
	float a1 = astc::clamp255f(color1.lane<3>() * scale);

	// Transform a0 to unorm9, quantize the low 8 bits as the base value
	int a0a = astc::flt2int_rtn(a0);
	a0a <<= 1;
	int a0b = a0a & 0xFF;
	int a0be = quant_color(quant_level, a0b);
	a0b = a0be;
	a0b |= a0a & 0x100;
	int a1d = astc::flt2int_rtn(a1);
	a1d <<= 1;
	a1d -= a0b;

	// Offset must fit a 7-bit signed range
	if (a1d > 63 || a1d < -64)
	{
		return false;
	}

	// Insert the top bit of the base into the offset
	a1d &= 0x7F;
	a1d |= (a0b & 0x100) >> 1;

	// Encoding fails if quant/unquant flips the base top bit or the offset sign bit
	int a1de = quant_color(quant_level, a1d);
	int a1du = a1de;
	if ((a1d ^ a1du) & 0xC0)
	{
		return false;
	}

	// Reconstruct the decoded value (sign-extend the 7-bit offset, add the base)
	// and check that it stays inside the unorm9 range
	a1du &= 0x7F;
	if (a1du & 0x40)
	{
		a1du -= 0x80;
	}

	a1du += a0b;
	if (a1du < 0 || a1du > 0x1FF)
	{
		return false;
	}

	output[6] = static_cast<uint8_t>(a0be);
	output[7] = static_cast<uint8_t>(a1de);

	return true;
}
+
/**
 * @brief Try to quantize an LDR LA color using delta encoding.
 *
 * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
 * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
 * non-negative, then we encode a regular delta.
 *
 * This function compresses both the luminance and the alpha components; the RGB channels are
 * collapsed into a single luminance value before encoding.
 *
 * @param      color0        The input unquantized color0 endpoint.
 * @param      color1        The input unquantized color1 endpoint.
 * @param[out] output        The output endpoints, returned as (l0, l1, a0, a1).
 * @param      quant_level   The quantization level to use.
 *
 * @return Returns @c false on failure, @c true on success.
 */
static bool try_quantize_luminance_alpha_delta(
	vfloat4 color0,
	vfloat4 color1,
	uint8_t output[4],
	quant_method quant_level
) {
	float scale = 1.0f / 257.0f;

	// Luminance is the mean of the RGB components
	float l0 = astc::clamp255f(hadd_rgb_s(color0) * ((1.0f / 3.0f) * scale));
	float l1 = astc::clamp255f(hadd_rgb_s(color1) * ((1.0f / 3.0f) * scale));

	float a0 = astc::clamp255f(color0.lane<3>() * scale);
	float a1 = astc::clamp255f(color1.lane<3>() * scale);

	// Transform the base endpoints to unorm9 and quantize the low 8 bits
	int l0a = astc::flt2int_rtn(l0);
	int a0a = astc::flt2int_rtn(a0);
	l0a <<= 1;
	a0a <<= 1;

	int l0b = l0a & 0xFF;
	int a0b = a0a & 0xFF;
	int l0be = quant_color(quant_level, l0b);
	int a0be = quant_color(quant_level, a0b);
	l0b = l0be;
	a0b = a0be;
	l0b |= l0a & 0x100;
	a0b |= a0a & 0x100;

	// Take the unorm9 differences against the quantized bases
	int l1d = astc::flt2int_rtn(l1);
	int a1d = astc::flt2int_rtn(a1);
	l1d <<= 1;
	a1d <<= 1;
	l1d -= l0b;
	a1d -= a0b;

	// Both offsets must fit a 7-bit signed range
	if (l1d > 63 || l1d < -64)
	{
		return false;
	}

	if (a1d > 63 || a1d < -64)
	{
		return false;
	}

	// Insert the top bit of each base into its offset
	l1d &= 0x7F;
	a1d &= 0x7F;
	l1d |= (l0b & 0x100) >> 1;
	a1d |= (a0b & 0x100) >> 1;

	// Encoding fails if quant/unquant flips the base top bit or the offset sign bit
	int l1de = quant_color(quant_level, l1d);
	int a1de = quant_color(quant_level, a1d);
	int l1du = l1de;
	int a1du = a1de;

	if ((l1d ^ l1du) & 0xC0)
	{
		return false;
	}

	if ((a1d ^ a1du) & 0xC0)
	{
		return false;
	}

	// Reconstruct the decoded values (sign-extend the 7-bit offsets, add the
	// bases) and check that they stay inside the unorm9 range
	l1du &= 0x7F;
	a1du &= 0x7F;

	if (l1du & 0x40)
	{
		l1du -= 0x80;
	}

	if (a1du & 0x40)
	{
		a1du -= 0x80;
	}

	l1du += l0b;
	a1du += a0b;

	if (l1du < 0 || l1du > 0x1FF)
	{
		return false;
	}

	if (a1du < 0 || a1du > 0x1FF)
	{
		return false;
	}

	output[0] = static_cast<uint8_t>(l0be);
	output[1] = static_cast<uint8_t>(l1de);
	output[2] = static_cast<uint8_t>(a0be);
	output[3] = static_cast<uint8_t>(a1de);

	return true;
}
+
+/**
+ * @brief Try to quantize an LDR RGBA color using delta encoding.
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * This function only compressed the alpha - the other elements in the output array are not touched.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1).
+ * @param      quant_level   The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgba_delta(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	return try_quantize_rgb_delta(color0, color1, output, quant_level) &&
+	       try_quantize_alpha_delta(color0, color1, output, quant_level);
+}
+
+
+/**
+ * @brief Try to quantize an LDR RGBA color using delta and blue contract encoding.
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * This function only compressed the alpha - the other elements in the output array are not touched.
+ *
+ * @param      color0       The input unquantized color0 endpoint.
+ * @param      color1       The input unquantized color1 endpoint.
+ * @param[out] output       The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1).
+ * @param      quant_level  The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgba_delta_blue_contract(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	// Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract
+	return try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level) &&
+	       try_quantize_alpha_delta(color1, color0, output, quant_level);
+}
+
+/**
+ * @brief Quantize an LDR RGB color using scale encoding.
+ *
+ * @param      color         The input unquantized color endpoint and scale factor.
+ * @param[out] output        The output endpoints, returned as (r0, g0, b0, s).
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_rgbs(
+	vfloat4 color,
+	uint8_t output[4],
+	quant_method quant_level
+) {
+	float scale = 1.0f / 257.0f;
+
+	float r = astc::clamp255f(color.lane<0>() * scale);
+	float g = astc::clamp255f(color.lane<1>() * scale);
+	float b = astc::clamp255f(color.lane<2>() * scale);
+
+	int ri = quant_color(quant_level, astc::flt2int_rtn(r));
+	int gi = quant_color(quant_level, astc::flt2int_rtn(g));
+	int bi = quant_color(quant_level, astc::flt2int_rtn(b));
+
+	float oldcolorsum = hadd_rgb_s(color) * scale;
+	float newcolorsum = static_cast<float>(ri + gi + bi);
+
+	float scalea = astc::clamp1f(color.lane<3>() * (oldcolorsum + 1e-10f) / (newcolorsum + 1e-10f));
+	int scale_idx = astc::flt2int_rtn(scalea * 256.0f);
+	scale_idx = astc::clamp(scale_idx, 0, 255);
+
+	output[0] = static_cast<uint8_t>(ri);
+	output[1] = static_cast<uint8_t>(gi);
+	output[2] = static_cast<uint8_t>(bi);
+	output[3] = quant_color(quant_level, scale_idx);
+}
+
/**
 * @brief Quantize an LDR RGBA color using scale encoding.
 *
 * @param      color0       The input unquantized color0 endpoint; only alpha is used.
 * @param      color1       The input unquantized color1 endpoint; only alpha is used.
 * @param      color        The input unquantized color endpoint and scale factor.
 * @param[out] output       The output endpoints, returned as (r0, g0, b0, s, a0, a1).
 * @param      quant_level  The quantization level to use.
 */
static void quantize_rgbs_alpha(
	vfloat4 color0,
	vfloat4 color1,
	vfloat4 color,
	uint8_t output[6],
	quant_method quant_level
) {
	float scale = 1.0f / 257.0f;

	float a0 = astc::clamp255f(color0.lane<3>() * scale);
	float a1 = astc::clamp255f(color1.lane<3>() * scale);

	// Alpha endpoints follow the (r, g, b, s) block written by quantize_rgbs
	output[4] = quant_color(quant_level, astc::flt2int_rtn(a0));
	output[5] = quant_color(quant_level, astc::flt2int_rtn(a1));

	quantize_rgbs(color, output, quant_level);
}
+
+/**
+ * @brief Quantize a LDR L color.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (l0, l1).
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_luminance(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[2],
+	quant_method quant_level
+) {
+	float scale = 1.0f / 257.0f;
+
+	color0 = color0 * scale;
+	color1 = color1 * scale;
+
+	float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
+	float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
+
+	if (lum0 > lum1)
+	{
+		float avg = (lum0 + lum1) * 0.5f;
+		lum0 = avg;
+		lum1 = avg;
+	}
+
+	output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0));
+	output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1));
+}
+
+/**
+ * @brief Quantize a LDR LA color.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as (l0, l1, a0, a1).
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_luminance_alpha(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[4],
+	quant_method quant_level
+) {
+	// Rescale from the 16-bit working range (0..65535) into 0..255
+	float scale = 1.0f / 257.0f;
+
+	color0 = color0 * scale;
+	color1 = color1 * scale;
+
+	// Luminance is the plain average of the three RGB channels
+	float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
+	float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
+
+	float a0 = astc::clamp255f(color0.lane<3>());
+	float a1 = astc::clamp255f(color1.lane<3>());
+
+	// If endpoints are close then pull apart slightly; this gives > 8 bit normal map precision.
+	// NOTE(review): 18 appears to be a quant_method enum ordinal threshold - confirm against
+	// the quant_method declaration before relying on it.
+	if (quant_level > 18)
+	{
+		if (fabsf(lum0 - lum1) < 3.0f)
+		{
+			if (lum0 < lum1)
+			{
+				lum0 -= 0.5f;
+				lum1 += 0.5f;
+			}
+			else
+			{
+				lum0 += 0.5f;
+				lum1 -= 0.5f;
+			}
+
+			lum0 = astc::clamp255f(lum0);
+			lum1 = astc::clamp255f(lum1);
+		}
+
+		if (fabsf(a0 - a1) < 3.0f)
+		{
+			if (a0 < a1)
+			{
+				a0 -= 0.5f;
+				a1 += 0.5f;
+			}
+			else
+			{
+				a0 += 0.5f;
+				a1 -= 0.5f;
+			}
+
+			a0 = astc::clamp255f(a0);
+			a1 = astc::clamp255f(a1);
+		}
+	}
+
+	output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0));
+	output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1));
+	output[2] = quant_color(quant_level, astc::flt2int_rtn(a0));
+	output[3] = quant_color(quant_level, astc::flt2int_rtn(a1));
+}
+
+/**
+ * @brief Quantize and unquantize a value ensuring top two bits are the same.
+ *
+ * @param      quant_level     The quantization level to use.
+ * @param      value           The input unquantized value.
+ * @param[out] quant_value     The quantized value.
+ */
+static inline void quantize_and_unquantize_retain_top_two_bits(
+	quant_method quant_level,
+	uint8_t value,
+	uint8_t& quant_value
+) {
+	// The top two bits carry mode information that must survive the quant/unquant
+	// round-trip unchanged; nudge the input towards the original top bits until
+	// the round-trip preserves them.
+	bool perform_loop;
+	uint8_t quantval;
+
+	do
+	{
+		quantval = quant_color(quant_level, value);
+
+		// Perform looping if the top two bits were modified by quant/unquant
+		perform_loop = (value & 0xC0) != (quantval & 0xC0);
+
+		if ((quantval & 0xC0) > (value & 0xC0))
+		{
+			// Quant/unquant rounded UP so that the top two bits changed;
+			// decrement the input in hopes that this will avoid rounding up.
+			value--;
+		}
+		else if ((quantval & 0xC0) < (value & 0xC0))
+		{
+			// Quant/unquant rounded DOWN so that the top two bits changed;
+			// increment the input in hopes that this will avoid rounding down.
+			// (Decrementing here would move further past the boundary and could
+			// exit the loop with corrupted mode bits.)
+			value++;
+		}
+	} while (perform_loop);
+
+	quant_value = quantval;
+}
+
+/**
+ * @brief Quantize and unquantize a value ensuring top four bits are the same.
+ *
+ * @param      quant_level     The quantization level to use.
+ * @param      value           The input unquantized value.
+ * @param[out] quant_value     The quantized value in 0-255 range.
+ */
+static inline void quantize_and_unquantize_retain_top_four_bits(
+	quant_method quant_level,
+	uint8_t value,
+	uint8_t& quant_value
+) {
+	// The top four bits carry mode information that must survive the quant/unquant
+	// round-trip unchanged; nudge the input towards the original top bits until
+	// the round-trip preserves them.
+	bool perform_loop;
+	uint8_t quantval;
+
+	do
+	{
+		quantval = quant_color(quant_level, value);
+		// Perform looping if the top four bits were modified by quant/unquant
+		perform_loop = (value & 0xF0) != (quantval & 0xF0);
+
+		if ((quantval & 0xF0) > (value & 0xF0))
+		{
+			// Quant/unquant rounded UP so that the top four bits changed;
+			// decrement the input value in hopes that this will avoid rounding up.
+			value--;
+		}
+		else if ((quantval & 0xF0) < (value & 0xF0))
+		{
+			// Quant/unquant rounded DOWN so that the top four bits changed;
+			// increment the input value in hopes that this will avoid rounding down.
+			// (Decrementing here would move further past the boundary and could
+			// exit the loop with corrupted mode bits.)
+			value++;
+		}
+	} while (perform_loop);
+
+	quant_value = quantval;
+}
+
+/**
+ * @brief Quantize a HDR RGB color using RGB + offset.
+ *
+ * @param      color         The input unquantized color endpoint and offset.
+ * @param[out] output        The output endpoints, returned as packed RGBS with some mode bits.
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_rgbo(
+	vfloat4 color,
+	uint8_t output[4],
+	quant_method quant_level
+) {
+	// Add the shared offset (stored in the alpha lane) onto each RGB channel
+	color.set_lane<0>(color.lane<0>() + color.lane<3>());
+	color.set_lane<1>(color.lane<1>() + color.lane<3>());
+	color.set_lane<2>(color.lane<2>() + color.lane<3>());
+
+	color = clamp(0.0f, 65535.0f, color);
+
+	vfloat4 color_bak = color;
+
+	int majcomp;
+	if (color.lane<0>() > color.lane<1>() && color.lane<0>() > color.lane<2>())
+	{
+		majcomp = 0;			// red is largest component
+	}
+	else if (color.lane<1>() > color.lane<2>())
+	{
+		majcomp = 1;			// green is largest component
+	}
+	else
+	{
+		majcomp = 2;			// blue is largest component
+	}
+
+	// swap around the red component and the largest component.
+	switch (majcomp)
+	{
+	case 1:
+		color = color.swz<1, 0, 2, 3>();
+		break;
+	case 2:
+		color = color.swz<2, 1, 0, 3>();
+		break;
+	default:
+		break;
+	}
+
+	// Per-mode bit counts for the (R, G/B delta, scale) fields
+	static const int mode_bits[5][3] {
+		{11, 5, 7},
+		{11, 6, 5},
+		{10, 5, 8},
+		{9, 6, 7},
+		{8, 7, 6}
+	};
+
+	static const float mode_cutoffs[5][2] {
+		{1024, 4096},
+		{2048, 1024},
+		{2048, 16384},
+		{8192, 16384},
+		{32768, 16384}
+	};
+
+	static const float mode_rscales[5] {
+		32.0f,
+		32.0f,
+		64.0f,
+		128.0f,
+		256.0f,
+	};
+
+	static const float mode_scales[5] {
+		1.0f / 32.0f,
+		1.0f / 32.0f,
+		1.0f / 64.0f,
+		1.0f / 128.0f,
+		1.0f / 256.0f,
+	};
+
+	float r_base = color.lane<0>();
+	float g_base = color.lane<0>() - color.lane<1>() ;
+	float b_base = color.lane<0>() - color.lane<2>() ;
+	float s_base = color.lane<3>() ;
+
+	// Try each mode in turn; earlier modes store R with more bits (see mode_bits)
+	for (int mode = 0; mode < 5; mode++)
+	{
+		if (g_base > mode_cutoffs[mode][0] || b_base > mode_cutoffs[mode][0] || s_base > mode_cutoffs[mode][1])
+		{
+			continue;
+		}
+
+		// Encode the mode into a 4-bit vector
+		int mode_enc = mode < 4 ? (mode | (majcomp << 2)) : (majcomp | 0xC);
+
+		float mode_scale = mode_scales[mode];
+		float mode_rscale = mode_rscales[mode];
+
+		int gb_intcutoff = 1 << mode_bits[mode][1];
+		int s_intcutoff = 1 << mode_bits[mode][2];
+
+		// Quantize and unquantize R
+		int r_intval = astc::flt2int_rtn(r_base * mode_scale);
+
+		int r_lowbits = r_intval & 0x3f;
+
+		r_lowbits |= (mode_enc & 3) << 6;
+
+		uint8_t r_quantval;
+		quantize_and_unquantize_retain_top_two_bits(
+		    quant_level, static_cast<uint8_t>(r_lowbits), r_quantval);
+
+		r_intval = (r_intval & ~0x3f) | (r_quantval & 0x3f);
+		float r_fval = static_cast<float>(r_intval) * mode_rscale;
+
+		// Recompute G and B, then quantize and unquantize them
+		float g_fval = r_fval - color.lane<1>() ;
+		float b_fval = r_fval - color.lane<2>() ;
+
+		g_fval = astc::clamp(g_fval, 0.0f, 65535.0f);
+		b_fval = astc::clamp(b_fval, 0.0f, 65535.0f);
+
+		int g_intval = astc::flt2int_rtn(g_fval * mode_scale);
+		int b_intval = astc::flt2int_rtn(b_fval * mode_scale);
+
+		if (g_intval >= gb_intcutoff || b_intval >= gb_intcutoff)
+		{
+			continue;
+		}
+
+		int g_lowbits = g_intval & 0x1f;
+		int b_lowbits = b_intval & 0x1f;
+
+		// Scatter the high bits of the integer values into the spare bit
+		// positions mandated by the chosen mode
+		int bit0 = 0;
+		int bit1 = 0;
+		int bit2 = 0;
+		int bit3 = 0;
+
+		// Note: mode is always in 0..4 here; the case 5 labels are unreachable
+		switch (mode)
+		{
+		case 0:
+		case 2:
+			bit0 = (r_intval >> 9) & 1;
+			break;
+		case 1:
+		case 3:
+			bit0 = (r_intval >> 8) & 1;
+			break;
+		case 4:
+		case 5:
+			bit0 = (g_intval >> 6) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 0:
+		case 1:
+		case 2:
+		case 3:
+			bit2 = (r_intval >> 7) & 1;
+			break;
+		case 4:
+		case 5:
+			bit2 = (b_intval >> 6) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 0:
+		case 2:
+			bit1 = (r_intval >> 8) & 1;
+			break;
+		case 1:
+		case 3:
+		case 4:
+		case 5:
+			bit1 = (g_intval >> 5) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 0:
+			bit3 = (r_intval >> 10) & 1;
+			break;
+		case 2:
+			bit3 = (r_intval >> 6) & 1;
+			break;
+		case 1:
+		case 3:
+		case 4:
+		case 5:
+			bit3 = (b_intval >> 5) & 1;
+			break;
+		}
+
+		g_lowbits |= (mode_enc & 0x4) << 5;
+		b_lowbits |= (mode_enc & 0x8) << 4;
+
+		g_lowbits |= bit0 << 6;
+		g_lowbits |= bit1 << 5;
+		b_lowbits |= bit2 << 6;
+		b_lowbits |= bit3 << 5;
+
+		uint8_t g_quantval;
+		uint8_t b_quantval;
+
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(g_lowbits), g_quantval);
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(b_lowbits), b_quantval);
+
+		g_intval = (g_intval & ~0x1f) | (g_quantval & 0x1f);
+		b_intval = (b_intval & ~0x1f) | (b_quantval & 0x1f);
+
+		g_fval = static_cast<float>(g_intval) * mode_rscale;
+		b_fval = static_cast<float>(b_intval) * mode_rscale;
+
+		// Recompute the scale value, based on the errors introduced to red, green and blue
+
+		// If the error is positive, then the R,G,B errors combined have raised the color
+		// value overall; as such, the scale value needs to be increased.
+		float rgb_errorsum = (r_fval - color.lane<0>() ) + (r_fval - g_fval - color.lane<1>() ) + (r_fval - b_fval - color.lane<2>() );
+
+		float s_fval = s_base + rgb_errorsum * (1.0f / 3.0f);
+		s_fval = astc::clamp(s_fval, 0.0f, 1e9f);
+
+		int s_intval = astc::flt2int_rtn(s_fval * mode_scale);
+
+		if (s_intval >= s_intcutoff)
+		{
+			continue;
+		}
+
+		int s_lowbits = s_intval & 0x1f;
+
+		int bit4;
+		int bit5;
+		int bit6;
+		switch (mode)
+		{
+		case 1:
+			bit6 = (r_intval >> 9) & 1;
+			break;
+		default:
+			bit6 = (s_intval >> 5) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 4:
+			bit5 = (r_intval >> 7) & 1;
+			break;
+		case 1:
+			bit5 = (r_intval >> 10) & 1;
+			break;
+		default:
+			bit5 = (s_intval >> 6) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 2:
+			bit4 = (s_intval >> 7) & 1;
+			break;
+		default:
+			bit4 = (r_intval >> 6) & 1;
+			break;
+		}
+
+		s_lowbits |= bit6 << 5;
+		s_lowbits |= bit5 << 6;
+		s_lowbits |= bit4 << 7;
+
+		uint8_t s_quantval;
+
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(s_lowbits), s_quantval);
+
+		output[0] = r_quantval;
+		output[1] = g_quantval;
+		output[2] = b_quantval;
+		output[3] = s_quantval;
+		return;
+	}
+
+	// Failed to encode any of the modes above? In that case encode using mode #5
+	float vals[4];
+	vals[0] = color_bak.lane<0>();
+	vals[1] = color_bak.lane<1>();
+	vals[2] = color_bak.lane<2>();
+	vals[3] = color_bak.lane<3>();
+
+	int ivals[4];
+	float cvals[3];
+
+	for (int i = 0; i < 3; i++)
+	{
+		vals[i] = astc::clamp(vals[i], 0.0f, 65020.0f);
+		ivals[i] = astc::flt2int_rtn(vals[i] * (1.0f / 512.0f));
+		cvals[i] = static_cast<float>(ivals[i]) * 512.0f;
+	}
+
+	// Compensate the scale for the rounding error introduced in RGB
+	float rgb_errorsum = (cvals[0] - vals[0]) + (cvals[1] - vals[1]) + (cvals[2] - vals[2]);
+	vals[3] += rgb_errorsum * (1.0f / 3.0f);
+
+	vals[3] = astc::clamp(vals[3], 0.0f, 65020.0f);
+	ivals[3] = astc::flt2int_rtn(vals[3] * (1.0f / 512.0f));
+
+	int encvals[4];
+	encvals[0] = (ivals[0] & 0x3f) | 0xC0;
+	encvals[1] = (ivals[1] & 0x7f) | 0x80;
+	encvals[2] = (ivals[2] & 0x7f) | 0x80;
+	encvals[3] = (ivals[3] & 0x7f) | ((ivals[0] & 0x40) << 1);
+
+	for (uint8_t i = 0; i < 4; i++)
+	{
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(encvals[i]), output[i]);
+	}
+
+	return;
+}
+
+/**
+ * @brief Quantize a HDR RGB color using direct RGB encoding.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed RGB+RGB pairs with mode bits.
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_rgb(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[6],
+	quant_method quant_level
+) {
+	// Note: color*.lane<3> is not used so we can ignore it
+	color0 = clamp(0.0f, 65535.0f, color0);
+	color1 = clamp(0.0f, 65535.0f, color1);
+
+	vfloat4 color0_bak = color0;
+	vfloat4 color1_bak = color1;
+
+	// Pick the largest component of the second (bright) endpoint as major
+	int majcomp;
+	if (color1.lane<0>() > color1.lane<1>() && color1.lane<0>() > color1.lane<2>())
+	{
+		majcomp = 0;
+	}
+	else if (color1.lane<1>() > color1.lane<2>())
+	{
+		majcomp = 1;
+	}
+	else
+	{
+		majcomp = 2;
+	}
+
+	// Swizzle the components
+	switch (majcomp)
+	{
+	case 1:  // red-green swap
+		color0 = color0.swz<1, 0, 2, 3>();
+		color1 = color1.swz<1, 0, 2, 3>();
+		break;
+	case 2:  // red-blue swap
+		color0 = color0.swz<2, 1, 0, 3>();
+		color1 = color1.swz<2, 1, 0, 3>();
+		break;
+	default:
+		break;
+	}
+
+	// Base values for the a/b/c/d fields of the encoding
+	float a_base = color1.lane<0>();
+	a_base = astc::clamp(a_base, 0.0f, 65535.0f);
+
+	float b0_base = a_base - color1.lane<1>();
+	float b1_base = a_base - color1.lane<2>();
+	float c_base = a_base - color0.lane<0>();
+	float d0_base = a_base - b0_base - c_base - color0.lane<1>();
+	float d1_base = a_base - b1_base - c_base - color0.lane<2>();
+
+	// Number of bits in the various fields in the various modes
+	static const int mode_bits[8][4] {
+		{9, 7, 6, 7},
+		{9, 8, 6, 6},
+		{10, 6, 7, 7},
+		{10, 7, 7, 6},
+		{11, 8, 6, 5},
+		{11, 6, 8, 6},
+		{12, 7, 7, 5},
+		{12, 6, 7, 6}
+	};
+
+	// Cutoffs to use for the computed values of a,b,c,d, assuming the
+	// range 0..65535 are LNS values corresponding to fp16.
+	static const float mode_cutoffs[8][4] {
+		{16384, 8192, 8192, 8},	// mode 0: 9,7,6,7
+		{32768, 8192, 4096, 8},	// mode 1: 9,8,6,6
+		{4096, 8192, 4096, 4},	// mode 2: 10,6,7,7
+		{8192, 8192, 2048, 4},	// mode 3: 10,7,7,6
+		{8192, 2048, 512, 2},	// mode 4: 11,8,6,5
+		{2048, 8192, 1024, 2},	// mode 5: 11,6,8,6
+		{2048, 2048, 256, 1},	// mode 6: 12,7,7,5
+		{1024, 2048, 512, 1},	// mode 7: 12,6,7,6
+	};
+
+	static const float mode_scales[8] {
+		1.0f / 128.0f,
+		1.0f / 128.0f,
+		1.0f / 64.0f,
+		1.0f / 64.0f,
+		1.0f / 32.0f,
+		1.0f / 32.0f,
+		1.0f / 16.0f,
+		1.0f / 16.0f,
+	};
+
+	// Scaling factors when going from what was encoded in the mode to 16 bits.
+	static const float mode_rscales[8] {
+		128.0f,
+		128.0f,
+		64.0f,
+		64.0f,
+		32.0f,
+		32.0f,
+		16.0f,
+		16.0f
+	};
+
+	// Try modes one by one, with the highest-precision mode first.
+	for (int mode = 7; mode >= 0; mode--)
+	{
+		// For each mode, test if we can in fact accommodate the computed b, c, and d values.
+		// If we clearly can't, then we skip to the next mode.
+
+		float b_cutoff = mode_cutoffs[mode][0];
+		float c_cutoff = mode_cutoffs[mode][1];
+		float d_cutoff = mode_cutoffs[mode][2];
+
+		if (b0_base > b_cutoff || b1_base > b_cutoff || c_base > c_cutoff || fabsf(d0_base) > d_cutoff || fabsf(d1_base) > d_cutoff)
+		{
+			continue;
+		}
+
+		float mode_scale = mode_scales[mode];
+		float mode_rscale = mode_rscales[mode];
+
+		int b_intcutoff = 1 << mode_bits[mode][1];
+		int c_intcutoff = 1 << mode_bits[mode][2];
+		int d_intcutoff = 1 << (mode_bits[mode][3] - 1);
+
+		// Quantize and unquantize A, with the assumption that its high bits can be handled safely.
+		int a_intval = astc::flt2int_rtn(a_base * mode_scale);
+		int a_lowbits = a_intval & 0xFF;
+
+		int a_quantval = quant_color(quant_level, a_lowbits);
+		int a_uquantval = a_quantval;
+		a_intval = (a_intval & ~0xFF) | a_uquantval;
+		float a_fval = static_cast<float>(a_intval) * mode_rscale;
+
+		// Recompute C, then quantize and unquantize it
+		float c_fval = a_fval - color0.lane<0>();
+		c_fval = astc::clamp(c_fval, 0.0f, 65535.0f);
+
+		int c_intval = astc::flt2int_rtn(c_fval * mode_scale);
+
+		if (c_intval >= c_intcutoff)
+		{
+			continue;
+		}
+
+		int c_lowbits = c_intval & 0x3f;
+
+		c_lowbits |= (mode & 1) << 7;
+		c_lowbits |= (a_intval & 0x100) >> 2;
+
+		uint8_t c_quantval;
+
+		quantize_and_unquantize_retain_top_two_bits(
+		    quant_level, static_cast<uint8_t>(c_lowbits), c_quantval);
+
+		c_intval = (c_intval & ~0x3F) | (c_quantval & 0x3F);
+		c_fval = static_cast<float>(c_intval) * mode_rscale;
+
+		// Recompute B0 and B1, then quantize and unquantize them
+		float b0_fval = a_fval - color1.lane<1>();
+		float b1_fval = a_fval - color1.lane<2>();
+
+		b0_fval = astc::clamp(b0_fval, 0.0f, 65535.0f);
+		b1_fval = astc::clamp(b1_fval, 0.0f, 65535.0f);
+		int b0_intval = astc::flt2int_rtn(b0_fval * mode_scale);
+		int b1_intval = astc::flt2int_rtn(b1_fval * mode_scale);
+
+		if (b0_intval >= b_intcutoff || b1_intval >= b_intcutoff)
+		{
+			continue;
+		}
+
+		int b0_lowbits = b0_intval & 0x3f;
+		int b1_lowbits = b1_intval & 0x3f;
+
+		// Scatter the high bits of the integer values into the spare bit
+		// positions mandated by the chosen mode
+		int bit0 = 0;
+		int bit1 = 0;
+		switch (mode)
+		{
+		case 0:
+		case 1:
+		case 3:
+		case 4:
+		case 6:
+			bit0 = (b0_intval >> 6) & 1;
+			break;
+		case 2:
+		case 5:
+		case 7:
+			bit0 = (a_intval >> 9) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 0:
+		case 1:
+		case 3:
+		case 4:
+		case 6:
+			bit1 = (b1_intval >> 6) & 1;
+			break;
+		case 2:
+			bit1 = (c_intval >> 6) & 1;
+			break;
+		case 5:
+		case 7:
+			bit1 = (a_intval >> 10) & 1;
+			break;
+		}
+
+		b0_lowbits |= bit0 << 6;
+		b1_lowbits |= bit1 << 6;
+
+		b0_lowbits |= ((mode >> 1) & 1) << 7;
+		b1_lowbits |= ((mode >> 2) & 1) << 7;
+
+		uint8_t b0_quantval;
+		uint8_t b1_quantval;
+
+		quantize_and_unquantize_retain_top_two_bits(
+		    quant_level, static_cast<uint8_t>(b0_lowbits), b0_quantval);
+		quantize_and_unquantize_retain_top_two_bits(
+		    quant_level, static_cast<uint8_t>(b1_lowbits), b1_quantval);
+
+		b0_intval = (b0_intval & ~0x3f) | (b0_quantval & 0x3f);
+		b1_intval = (b1_intval & ~0x3f) | (b1_quantval & 0x3f);
+		b0_fval = static_cast<float>(b0_intval) * mode_rscale;
+		b1_fval = static_cast<float>(b1_intval) * mode_rscale;
+
+		// Recompute D0 and D1, then quantize and unquantize them
+		float d0_fval = a_fval - b0_fval - c_fval - color0.lane<1>();
+		float d1_fval = a_fval - b1_fval - c_fval - color0.lane<2>();
+
+		d0_fval = astc::clamp(d0_fval, -65535.0f, 65535.0f);
+		d1_fval = astc::clamp(d1_fval, -65535.0f, 65535.0f);
+
+		int d0_intval = astc::flt2int_rtn(d0_fval * mode_scale);
+		int d1_intval = astc::flt2int_rtn(d1_fval * mode_scale);
+
+		if (abs(d0_intval) >= d_intcutoff || abs(d1_intval) >= d_intcutoff)
+		{
+			continue;
+		}
+
+		int d0_lowbits = d0_intval & 0x1f;
+		int d1_lowbits = d1_intval & 0x1f;
+
+		int bit2 = 0;
+		int bit3 = 0;
+		int bit4;
+		int bit5;
+		switch (mode)
+		{
+		case 0:
+		case 2:
+			bit2 = (d0_intval >> 6) & 1;
+			break;
+		case 1:
+		case 4:
+			bit2 = (b0_intval >> 7) & 1;
+			break;
+		case 3:
+			bit2 = (a_intval >> 9) & 1;
+			break;
+		case 5:
+			bit2 = (c_intval >> 7) & 1;
+			break;
+		case 6:
+		case 7:
+			bit2 = (a_intval >> 11) & 1;
+			break;
+		}
+		switch (mode)
+		{
+		case 0:
+		case 2:
+			bit3 = (d1_intval >> 6) & 1;
+			break;
+		case 1:
+		case 4:
+			bit3 = (b1_intval >> 7) & 1;
+			break;
+		case 3:
+		case 5:
+		case 6:
+		case 7:
+			bit3 = (c_intval >> 6) & 1;
+			break;
+		}
+
+		switch (mode)
+		{
+		case 4:
+		case 6:
+			bit4 = (a_intval >> 9) & 1;
+			bit5 = (a_intval >> 10) & 1;
+			break;
+		default:
+			bit4 = (d0_intval >> 5) & 1;
+			bit5 = (d1_intval >> 5) & 1;
+			break;
+		}
+
+		d0_lowbits |= bit2 << 6;
+		d1_lowbits |= bit3 << 6;
+		d0_lowbits |= bit4 << 5;
+		d1_lowbits |= bit5 << 5;
+
+		d0_lowbits |= (majcomp & 1) << 7;
+		d1_lowbits |= ((majcomp >> 1) & 1) << 7;
+
+		uint8_t d0_quantval;
+		uint8_t d1_quantval;
+
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(d0_lowbits), d0_quantval);
+		quantize_and_unquantize_retain_top_four_bits(
+		    quant_level, static_cast<uint8_t>(d1_lowbits), d1_quantval);
+
+		output[0] = static_cast<uint8_t>(a_quantval);
+		output[1] = c_quantval;
+		output[2] = b0_quantval;
+		output[3] = b1_quantval;
+		output[4] = d0_quantval;
+		output[5] = d1_quantval;
+		return;
+	}
+
+	// If none of the modes fit we will use a flat representation for storing data, using 8 bits
+	// for red and green, and 7 bits for blue. This gives color accuracy roughly similar to LDR
+	// 4:4:3 which is not at all great but usable. This representation is used if the light color is
+	// more than 4x the color value of the dark color.
+	float vals[6];
+	vals[0] = color0_bak.lane<0>();
+	vals[1] = color1_bak.lane<0>();
+	vals[2] = color0_bak.lane<1>();
+	vals[3] = color1_bak.lane<1>();
+	vals[4] = color0_bak.lane<2>();
+	vals[5] = color1_bak.lane<2>();
+
+	for (int i = 0; i < 6; i++)
+	{
+		vals[i] = astc::clamp(vals[i], 0.0f, 65020.0f);
+	}
+
+	for (int i = 0; i < 4; i++)
+	{
+		int idx = astc::flt2int_rtn(vals[i] * 1.0f / 256.0f);
+		output[i] = quant_color(quant_level, idx);
+	}
+
+	for (int i = 4; i < 6; i++)
+	{
+		int idx = astc::flt2int_rtn(vals[i] * 1.0f / 512.0f) + 128;
+		quantize_and_unquantize_retain_top_two_bits(
+		    quant_level, static_cast<uint8_t>(idx), output[i]);
+	}
+
+	return;
+}
+
+/**
+ * @brief Quantize a HDR RGB + LDR A color using direct RGBA encoding.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed RGBA+RGBA pairs with mode bits.
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_rgb_ldr_alpha(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	// Encode the HDR RGB portion of both endpoints into output[0..5]
+	quantize_hdr_rgb(color0, color1, output, quant_level);
+
+	// Encode the LDR alpha pair into output[6..7], rescaling the 16-bit
+	// working range (0..65535) into the 0..255 LDR range
+	const float alpha_scale = 1.0f / 257.0f;
+	float alpha0 = astc::clamp255f(color0.lane<3>() * alpha_scale);
+	float alpha1 = astc::clamp255f(color1.lane<3>() * alpha_scale);
+
+	output[6] = quant_color(quant_level, astc::flt2int_rtn(alpha0));
+	output[7] = quant_color(quant_level, astc::flt2int_rtn(alpha1));
+}
+
+/**
+ * @brief Quantize a HDR L color using the large range encoding.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed (l0, l1).
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_luminance_large_range(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[2],
+	quant_method quant_level
+) {
+	// Luminance is the plain average of the three RGB channels
+	float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
+	float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
+
+	// If the endpoint pair is reversed, collapse both endpoints to their average
+	if (lum1 < lum0)
+	{
+		float avg = (lum0 + lum1) * 0.5f;
+		lum0 = avg;
+		lum1 = avg;
+	}
+
+	int ilum1 = astc::flt2int_rtn(lum1);
+	int ilum0 = astc::flt2int_rtn(lum0);
+
+	// Find the closest encodable point in the upper half of the code-point space
+	int upper_v0 = (ilum0 + 128) >> 8;
+	int upper_v1 = (ilum1 + 128) >> 8;
+
+	upper_v0 = astc::clamp(upper_v0, 0, 255);
+	upper_v1 = astc::clamp(upper_v1, 0, 255);
+
+	// Find the closest encodable point in the lower half of the code-point space
+	int lower_v0 = (ilum1 + 256) >> 8;
+	int lower_v1 = ilum0 >> 8;
+
+	lower_v0 = astc::clamp(lower_v0, 0, 255);
+	lower_v1 = astc::clamp(lower_v1, 0, 255);
+
+	// Determine the distance between the point in code-point space and the input value
+	int upper0_dec = upper_v0 << 8;
+	int upper1_dec = upper_v1 << 8;
+	int lower0_dec = (lower_v1 << 8) + 128;
+	int lower1_dec = (lower_v0 << 8) - 128;
+
+	int upper0_diff = upper0_dec - ilum0;
+	int upper1_diff = upper1_dec - ilum1;
+	int lower0_diff = lower0_dec - ilum0;
+	int lower1_diff = lower1_dec - ilum1;
+
+	// Pick whichever candidate has the smaller squared error
+	int upper_error = (upper0_diff * upper0_diff) + (upper1_diff * upper1_diff);
+	int lower_error = (lower0_diff * lower0_diff) + (lower1_diff * lower1_diff);
+
+	int v0, v1;
+	if (upper_error < lower_error)
+	{
+		v0 = upper_v0;
+		v1 = upper_v1;
+	}
+	else
+	{
+		v0 = lower_v0;
+		v1 = lower_v1;
+	}
+
+	// OK; encode
+	output[0] = quant_color(quant_level, v0);
+	output[1] = quant_color(quant_level, v1);
+}
+
+/**
+ * @brief Quantize a HDR L color using the small range encoding.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed (l0, l1) with mode bits.
+ * @param      quant_level   The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_hdr_luminance_small_range(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[2],
+	quant_method quant_level
+) {
+	// Luminance is the plain average of the three RGB channels
+	float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
+	float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
+
+	// If the endpoint pair is reversed, collapse both endpoints to their average
+	if (lum1 < lum0)
+	{
+		float avg = (lum0 + lum1) * 0.5f;
+		lum0 = avg;
+		lum1 = avg;
+	}
+
+	int ilum1 = astc::flt2int_rtn(lum1);
+	int ilum0 = astc::flt2int_rtn(lum0);
+
+	// Difference of more than a factor-of-2 results in immediate failure
+	if (ilum1 - ilum0 > 2048)
+	{
+		return false;
+	}
+
+	int lowval, highval, diffval;
+	int v0, v1;
+	int v0e, v1e;
+	int v0d, v1d;
+
+	// Try to encode the high-precision submode
+	lowval = (ilum0 + 16) >> 5;
+	highval = (ilum1 + 16) >> 5;
+
+	lowval = astc::clamp(lowval, 0, 2047);
+	highval = astc::clamp(highval, 0, 2047);
+
+	// The top bit of v0 must survive quantization as 0 to select this submode
+	v0 = lowval & 0x7F;
+	v0e = quant_color(quant_level, v0);
+	v0d = v0e;
+
+	if (v0d < 0x80)
+	{
+		lowval = (lowval & ~0x7F) | v0d;
+		diffval = highval - lowval;
+		if (diffval >= 0 && diffval <= 15)
+		{
+			v1 = ((lowval >> 3) & 0xF0) | diffval;
+			v1e = quant_color(quant_level, v1);
+			v1d = v1e;
+			// The top four bits of v1 must survive quantization unchanged
+			if ((v1d & 0xF0) == (v1 & 0xF0))
+			{
+				output[0] = static_cast<uint8_t>(v0e);
+				output[1] = static_cast<uint8_t>(v1e);
+				return true;
+			}
+		}
+	}
+
+	// Try to encode the low-precision submode
+	lowval = (ilum0 + 32) >> 6;
+	highval = (ilum1 + 32) >> 6;
+
+	lowval = astc::clamp(lowval, 0, 1023);
+	highval = astc::clamp(highval, 0, 1023);
+
+	// The top bit of v0 must survive quantization as 1 to select this submode
+	v0 = (lowval & 0x7F) | 0x80;
+	v0e = quant_color(quant_level, v0);
+	v0d = v0e;
+	if ((v0d & 0x80) == 0)
+	{
+		return false;
+	}
+
+	lowval = (lowval & ~0x7F) | (v0d & 0x7F);
+	diffval = highval - lowval;
+	if (diffval < 0 || diffval > 31)
+	{
+		return false;
+	}
+
+	v1 = ((lowval >> 2) & 0xE0) | diffval;
+	v1e = quant_color(quant_level, v1);
+	v1d = v1e;
+	// The top three bits of v1 must survive quantization unchanged
+	if ((v1d & 0xE0) != (v1 & 0xE0))
+	{
+		return false;
+	}
+
+	output[0] = static_cast<uint8_t>(v0e);
+	output[1] = static_cast<uint8_t>(v1e);
+	return true;
+}
+
+/**
+ * @brief Quantize a HDR A color using either delta or direct RGBA encoding.
+ *
+ * @param      alpha0        The input unquantized color0 endpoint.
+ * @param      alpha1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed RGBA+RGBA pairs with mode bits.
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_alpha(
+	float alpha0,
+	float alpha1,
+	uint8_t output[2],
+	quant_method quant_level
+) {
+	alpha0 = astc::clamp(alpha0, 0.0f, 65280.0f);
+	alpha1 = astc::clamp(alpha1, 0.0f, 65280.0f);
+
+	int ialpha0 = astc::flt2int_rtn(alpha0);
+	int ialpha1 = astc::flt2int_rtn(alpha1);
+
+	int val0, val1, diffval;
+	int v6, v7;
+	int v6e, v7e;
+	int v6d, v7d;
+
+	// Try to encode one of the delta submodes, in decreasing-precision order
+	for (int i = 2; i >= 0; i--)
+	{
+		// Round to the submode's precision
+		val0 = (ialpha0 + (128 >> i)) >> (8 - i);
+		val1 = (ialpha1 + (128 >> i)) >> (8 - i);
+
+		// Low submode-selector bit is packed into the top bit of v6; it must
+		// survive quantization unchanged
+		v6 = (val0 & 0x7F) | ((i & 1) << 7);
+		v6e = quant_color(quant_level, v6);
+		v6d = v6e;
+
+		if ((v6 ^ v6d) & 0x80)
+		{
+			continue;
+		}
+
+		val0 = (val0 & ~0x7f) | (v6d & 0x7f);
+		diffval = val1 - val0;
+		int cutoff = 32 >> i;
+		int mask = 2 * cutoff - 1;
+
+		if (diffval < -cutoff || diffval >= cutoff)
+		{
+			continue;
+		}
+
+		// Pack the high selector bit, the carry bit of val0, and the delta
+		v7 = ((i & 2) << 6) | ((val0 >> 7) << (6 - i)) | (diffval & mask);
+		v7e = quant_color(quant_level, v7);
+		v7d = v7e;
+
+		// Bits of v7 that must survive quantization, per submode
+		static const int testbits[3] { 0xE0, 0xF0, 0xF8 };
+
+		if ((v7 ^ v7d) & testbits[i])
+		{
+			continue;
+		}
+
+		output[0] = static_cast<uint8_t>(v6e);
+		output[1] = static_cast<uint8_t>(v7e);
+		return;
+	}
+
+	// Could not encode any of the delta modes; instead encode a flat value
+	val0 = (ialpha0 + 256) >> 9;
+	val1 = (ialpha1 + 256) >> 9;
+	v6 = val0 | 0x80;
+	v7 = val1 | 0x80;
+
+	output[0] = quant_color(quant_level, v6);
+	output[1] = quant_color(quant_level, v7);
+
+	return;
+}
+
+/**
+ * @brief Quantize a HDR RGBA color using either delta or direct RGBA encoding.
+ *
+ * @param      color0        The input unquantized color0 endpoint.
+ * @param      color1        The input unquantized color1 endpoint.
+ * @param[out] output        The output endpoints, returned as packed RGBA+RGBA pairs with mode bits.
+ * @param      quant_level   The quantization level to use.
+ */
+static void quantize_hdr_rgb_alpha(
+	vfloat4 color0,
+	vfloat4 color1,
+	uint8_t output[8],
+	quant_method quant_level
+) {
+	// RGB endpoints occupy output[0..5]; the alpha pair occupies output[6..7]
+	quantize_hdr_rgb(color0, color1, output, quant_level);
+	quantize_hdr_alpha(color0.lane<3>(), color1.lane<3>(), output + 6, quant_level);
+}
+
+/* See header for documentation. */
+uint8_t pack_color_endpoints(
+	vfloat4 color0,
+	vfloat4 color1,
+	vfloat4 rgbs_color,
+	vfloat4 rgbo_color,
+	int format,
+	uint8_t* output,
+	quant_method quant_level
+) {
+	assert(QUANT_6 <= quant_level && quant_level <= QUANT_256);
+
+	// We do not support negative colors
+	color0 = max(color0, 0.0f);
+	color1 = max(color1, 0.0f);
+
+	// Returns the endpoint format actually used, which may differ from the
+	// requested one when a delta or blue-contract variant succeeds
+	uint8_t retval = 0;
+
+	switch (format)
+	{
+	case FMT_RGB:
+		// Try the delta and blue-contract encodings first; fall back to
+		// direct quantization if none of them succeed
+		if (quant_level <= QUANT_160)
+		{
+			if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level))
+			{
+				retval = FMT_RGB_DELTA;
+				break;
+			}
+			if (try_quantize_rgb_delta(color0, color1, output, quant_level))
+			{
+				retval = FMT_RGB_DELTA;
+				break;
+			}
+		}
+		if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level))
+		{
+			retval = FMT_RGB;
+			break;
+		}
+		quantize_rgb(color0, color1, output, quant_level);
+		retval = FMT_RGB;
+		break;
+
+	case FMT_RGBA:
+		// Same fallback chain as FMT_RGB, with alpha endpoints included
+		if (quant_level <= QUANT_160)
+		{
+			if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level))
+			{
+				retval = FMT_RGBA_DELTA;
+				break;
+			}
+			if (try_quantize_rgba_delta(color0, color1, output, quant_level))
+			{
+				retval = FMT_RGBA_DELTA;
+				break;
+			}
+		}
+		if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level))
+		{
+			retval = FMT_RGBA;
+			break;
+		}
+		quantize_rgba(color0, color1, output, quant_level);
+		retval = FMT_RGBA;
+		break;
+
+	case FMT_RGB_SCALE:
+		quantize_rgbs(rgbs_color, output, quant_level);
+		retval = FMT_RGB_SCALE;
+		break;
+
+	case FMT_HDR_RGB_SCALE:
+		quantize_hdr_rgbo(rgbo_color, output, quant_level);
+		retval = FMT_HDR_RGB_SCALE;
+		break;
+
+	case FMT_HDR_RGB:
+		quantize_hdr_rgb(color0, color1, output, quant_level);
+		retval = FMT_HDR_RGB;
+		break;
+
+	case FMT_RGB_SCALE_ALPHA:
+		quantize_rgbs_alpha(color0, color1, rgbs_color, output, quant_level);
+		retval = FMT_RGB_SCALE_ALPHA;
+		break;
+
+	case FMT_HDR_LUMINANCE_SMALL_RANGE:
+	case FMT_HDR_LUMINANCE_LARGE_RANGE:
+		// Prefer the small-range encoding; fall back to large range on failure
+		if (try_quantize_hdr_luminance_small_range(color0, color1, output, quant_level))
+		{
+			retval = FMT_HDR_LUMINANCE_SMALL_RANGE;
+			break;
+		}
+		quantize_hdr_luminance_large_range(color0, color1, output, quant_level);
+		retval = FMT_HDR_LUMINANCE_LARGE_RANGE;
+		break;
+
+	case FMT_LUMINANCE:
+		quantize_luminance(color0, color1, output, quant_level);
+		retval = FMT_LUMINANCE;
+		break;
+
+	case FMT_LUMINANCE_ALPHA:
+		// NOTE(review): 18 appears to be a quant_method enum ordinal threshold,
+		// matching the quant_level > 18 test in quantize_luminance_alpha - confirm
+		if (quant_level <= 18)
+		{
+			if (try_quantize_luminance_alpha_delta(color0, color1, output, quant_level))
+			{
+				retval = FMT_LUMINANCE_ALPHA_DELTA;
+				break;
+			}
+		}
+		quantize_luminance_alpha(color0, color1, output, quant_level);
+		retval = FMT_LUMINANCE_ALPHA;
+		break;
+
+	case FMT_HDR_RGB_LDR_ALPHA:
+		quantize_hdr_rgb_ldr_alpha(color0, color1, output, quant_level);
+		retval = FMT_HDR_RGB_LDR_ALPHA;
+		break;
+
+	case FMT_HDR_RGBA:
+		quantize_hdr_rgb_alpha(color0, color1, output, quant_level);
+		retval = FMT_HDR_RGBA;
+		break;
+	}
+
+	return retval;
+}
+
+#endif

+ 941 - 0
thirdparty/astcenc/astcenc_color_unquantize.cpp

@@ -0,0 +1,941 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include <utility>
+
+/**
+ * @brief Functions for color unquantization.
+ */
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Un-blue-contract a color.
+ *
+ * This function reverses any applied blue contraction.
+ *
+ * @param input   The input color that has been blue-contracted.
+ *
+ * @return The uncontracted color.
+ */
+static ASTCENC_SIMD_INLINE vint4 uncontract_color(
+	vint4 input
+) {
+	// Only the red and green lanes were blue-contracted; blue and alpha pass through
+	vmask4 mask(true, true, false, false);
+	// Undo the contraction: lane = (lane + blue) >> 1 for the masked lanes
+	vint4 bc0 = asr<1>(input + input.lane<2>());
+	return select(input, bc0, mask);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses delta encoding.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color deltas.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgba_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Apply bit transfer
+	bit_transfer_signed(input1, input0);
+
+	// Apply blue-uncontraction if needed
+	// A negative RGB delta sum signals that the encoder applied blue
+	// contraction and stored the endpoints swapped; undo both here.
+	int rgb_sum = hadd_rgb_s(input1);
+	input1 = input1 + input0;
+	if (rgb_sum < 0)
+	{
+		input0 = uncontract_color(input0);
+		input1 = uncontract_color(input1);
+		std::swap(input0, input1);
+	}
+
+	// Clamp to the valid LDR range after applying the deltas
+	output0 = clamp(0, 255, input0);
+	output1 = clamp(0, 255, input1);
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses delta encoding.
+ *
+ * Output alpha set to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color deltas.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Reuse the RGBA delta decoder, then force both alpha lanes to opaque
+	rgba_delta_unpack(input0, input1, output0, output1);
+	output1.set_lane<3>(255);
+	output0.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses direct encoding.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgba_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Apply blue-uncontraction if needed
+	// Endpoint 0 having the larger RGB sum signals that the encoder applied
+	// blue contraction and swapped the endpoints; undo both operations.
+	if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
+	{
+		input0 = uncontract_color(input0);
+		input1 = uncontract_color(input1);
+		std::swap(input0, input1);
+	}
+
+	output0 = input0;
+	output1 = input1;
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses direct encoding.
+ *
+ * Output alpha set to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Decode as direct RGBA, then force both alpha lanes to opaque
+	rgba_unpack(input0, input1, output0, output1);
+	output1.set_lane<3>(255);
+	output0.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR RGBA color that uses scaled encoding.
+ *
+ * Note only the RGB channels use the scaled encoding, alpha uses direct.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      alpha1    The packed endpoint 1 alpha value.
+ * @param      scale     The packed quantized scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_alpha_unpack(
+	vint4 input0,
+	uint8_t alpha1,
+	uint8_t scale,
+	vint4& output0,
+	vint4& output1
+) {
+	// Endpoint 1 uses the stored RGB directly, with the stored alpha1
+	output1 = input0;
+	output1.set_lane<3>(alpha1);
+
+	// Endpoint 0 is the RGB scaled by the 0.8 fixed-point scale; its alpha is
+	// carried in lane 3 of input0 and must not be scaled
+	output0 = asr<8>(input0 * scale);
+	output0.set_lane<3>(input0.lane<3>());
+}
+
+/**
+ * @brief Unpack an LDR RGB color that uses scaled encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      scale     The packed scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_unpack(
+	vint4 input0,
+	int scale,
+	vint4& output0,
+	vint4& output1
+) {
+	// Endpoint 0 is the stored RGB scaled by the 0.8 fixed-point factor
+	output0 = asr<8>(input0 * scale);
+	output0.set_lane<3>(255);
+
+	// Endpoint 1 uses the stored RGB directly; both alphas are opaque
+	output1 = input0;
+	output1.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack an LDR L color that uses direct encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input     The packed endpoints.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	// Splat each stored luminance value across RGB, with opaque alpha
+	int v0 = input[0];
+	int v1 = input[1];
+	output0 = vint4(v0, v0, v0, 255);
+	output1 = vint4(v1, v1, v1, 255);
+}
+
+/**
+ * @brief Unpack an LDR L color that uses delta encoding.
+ *
+ * Output alpha is 255.
+ *
+ * @param      input     The packed endpoints (L0, L1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_delta_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+	// Base value: high 6 bits of v0 (shifted down) plus top 2 bits of v1
+	int l0 = (v0 >> 2) | (v1 & 0xC0);
+	// Second endpoint is the base plus a 6-bit unsigned delta
+	int l1 = l0 + (v1 & 0x3F);
+
+	l1 = astc::min(l1, 255);
+
+	output0 = vint4(l0, l0, l0, 255);
+	output1 = vint4(l1, l1, l1, 255);
+}
+
+/**
+ * @brief Unpack an LDR LA color that uses direct encoding.
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	// Values are stored directly as (L0, L1, A0, A1); splat L across RGB
+	int l0 = input[0];
+	int l1 = input[1];
+	int a0 = input[2];
+	int a1 = input[3];
+	output0 = vint4(l0, l0, l0, a0);
+	output1 = vint4(l1, l1, l1, a1);
+}
+
+/**
+ * @brief Unpack an LDR LA color that uses delta encoding.
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_delta_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int lum0 = input[0];
+	int lum1 = input[1];
+	int alpha0 = input[2];
+	int alpha1 = input[3];
+
+	// Transfer the top bit of each delta into the base values
+	lum0 |= (lum1 & 0x80) << 1;
+	alpha0 |= (alpha1 & 0x80) << 1;
+	lum1 &= 0x7F;
+	alpha1 &= 0x7F;
+
+	// Sign-extend the 7-bit deltas
+	if (lum1 & 0x40)
+	{
+		lum1 -= 0x80;
+	}
+
+	if (alpha1 & 0x40)
+	{
+		alpha1 -= 0x80;
+	}
+
+	// Drop the transferred precision bit and apply the deltas
+	lum0 >>= 1;
+	lum1 >>= 1;
+	alpha0 >>= 1;
+	alpha1 >>= 1;
+	lum1 += lum0;
+	alpha1 += alpha0;
+
+	// Clamp the second endpoint into the valid LDR range
+	lum1 = astc::clamp(lum1, 0, 255);
+	alpha1 = astc::clamp(alpha1, 0, 255);
+
+	output0 = vint4(lum0, lum0, lum0, alpha0);
+	output1 = vint4(lum1, lum1, lum1, alpha1);
+}
+
+/**
+ * @brief Unpack an HDR RGB + offset encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgbo_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+
+	// Reassemble the 4-bit mode from bits scattered across the first three bytes
+	int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
+
+	// Decode the major component and sub-mode from the mode value; three
+	// encodings exist depending on the top two mode bits
+	int majcomp;
+	int mode;
+	if ((modeval & 0xC) != 0xC)
+	{
+		majcomp = modeval >> 2;
+		mode = modeval & 3;
+	}
+	else if (modeval != 0xF)
+	{
+		majcomp = modeval & 3;
+		mode = 4;
+	}
+	else
+	{
+		majcomp = 0;
+		mode = 5;
+	}
+
+	int red = v0 & 0x3F;
+	int green = v1 & 0x1F;
+	int blue = v2 & 0x1F;
+	int scale = v3 & 0x1F;
+
+	int bit0 = (v1 >> 6) & 1;
+	int bit1 = (v1 >> 5) & 1;
+	int bit2 = (v2 >> 6) & 1;
+	int bit3 = (v2 >> 5) & 1;
+	int bit4 = (v3 >> 7) & 1;
+	int bit5 = (v3 >> 6) & 1;
+	int bit6 = (v3 >> 5) & 1;
+
+	// One-hot mode mask; each test below selects the modes that place a
+	// given variable bit into a given component
+	int ohcomp = 1 << mode;
+
+	if (ohcomp & 0x30)
+		green |= bit0 << 6;
+	if (ohcomp & 0x3A)
+		green |= bit1 << 5;
+	if (ohcomp & 0x30)
+		blue |= bit2 << 6;
+	if (ohcomp & 0x3A)
+		blue |= bit3 << 5;
+
+	if (ohcomp & 0x3D)
+		scale |= bit6 << 5;
+	if (ohcomp & 0x2D)
+		scale |= bit5 << 6;
+	if (ohcomp & 0x04)
+		scale |= bit4 << 7;
+
+	if (ohcomp & 0x3B)
+		red |= bit4 << 6;
+	if (ohcomp & 0x04)
+		red |= bit3 << 6;
+
+	if (ohcomp & 0x10)
+		red |= bit5 << 7;
+	if (ohcomp & 0x0F)
+		red |= bit2 << 7;
+
+	if (ohcomp & 0x05)
+		red |= bit1 << 8;
+	if (ohcomp & 0x0A)
+		red |= bit0 << 8;
+
+	if (ohcomp & 0x05)
+		red |= bit0 << 9;
+	if (ohcomp & 0x02)
+		red |= bit6 << 9;
+
+	if (ohcomp & 0x01)
+		red |= bit3 << 10;
+	if (ohcomp & 0x02)
+		red |= bit5 << 10;
+
+	// expand to 12 bits.
+	static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
+	int shamt = shamts[mode];
+	red <<= shamt;
+	green <<= shamt;
+	blue <<= shamt;
+	scale <<= shamt;
+
+	// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
+	// not absolute values.
+	if (mode != 5)
+	{
+		green = red - green;
+		blue = red - blue;
+	}
+
+	// switch around components.
+	int temp;
+	switch (majcomp)
+	{
+	case 1:
+		temp = red;
+		red = green;
+		green = temp;
+		break;
+	case 2:
+		temp = red;
+		red = blue;
+		blue = temp;
+		break;
+	default:
+		break;
+	}
+
+	// Endpoint 0 is endpoint 1 minus the shared scale offset
+	int red0 = red - scale;
+	int green0 = green - scale;
+	int blue0 = blue - scale;
+
+	// clamp to [0,0xFFF].
+	if (red < 0)
+		red = 0;
+	if (green < 0)
+		green = 0;
+	if (blue < 0)
+		blue = 0;
+
+	if (red0 < 0)
+		red0 = 0;
+	if (green0 < 0)
+		green0 = 0;
+	if (blue0 < 0)
+		blue0 = 0;
+
+	// Shift 12-bit values into 16-bit range; 0x7800 is FP16 1.0 for alpha
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_unpack(
+	const uint8_t input[6],
+	vint4& output0,
+	vint4& output1
+) {
+
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+	int v4 = input[4];
+	int v5 = input[5];
+
+	// extract all the fixed-placement bitfields
+	int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
+
+	int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
+
+	// Major component 3 stores direct (non-delta) endpoint values
+	if (majcomp == 3)
+	{
+		output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
+		output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
+		return;
+	}
+
+	int a = v0 | ((v1 & 0x40) << 2);
+	int b0 = v2 & 0x3f;
+	int b1 = v3 & 0x3f;
+	int c = v1 & 0x3f;
+	int d0 = v4 & 0x7f;
+	int d1 = v5 & 0x7f;
+
+	// get hold of the number of bits in 'd0' and 'd1'
+	static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
+	int dbits = dbits_tab[modeval];
+
+	// extract six variable-placement bits
+	int bit0 = (v2 >> 6) & 1;
+	int bit1 = (v3 >> 6) & 1;
+	int bit2 = (v4 >> 6) & 1;
+	int bit3 = (v5 >> 6) & 1;
+	int bit4 = (v4 >> 5) & 1;
+	int bit5 = (v5 >> 5) & 1;
+
+	// and prepend the variable-placement bits depending on mode.
+	int ohmod = 1 << modeval;	// one-hot-mode
+	if (ohmod & 0xA4)
+		a |= bit0 << 9;
+	if (ohmod & 0x8)
+		a |= bit2 << 9;
+	if (ohmod & 0x50)
+		a |= bit4 << 9;
+
+	if (ohmod & 0x50)
+		a |= bit5 << 10;
+	if (ohmod & 0xA0)
+		a |= bit1 << 10;
+
+	if (ohmod & 0xC0)
+		a |= bit2 << 11;
+
+	if (ohmod & 0x4)
+		c |= bit1 << 6;
+	if (ohmod & 0xE8)
+		c |= bit3 << 6;
+
+	if (ohmod & 0x20)
+		c |= bit2 << 7;
+
+	if (ohmod & 0x5B)
+	{
+		b0 |= bit0 << 6;
+		b1 |= bit1 << 6;
+	}
+
+	if (ohmod & 0x12)
+	{
+		b0 |= bit2 << 7;
+		b1 |= bit3 << 7;
+	}
+
+	if (ohmod & 0xAF)
+	{
+		d0 |= bit4 << 5;
+		d1 |= bit5 << 5;
+	}
+
+	if (ohmod & 0x5)
+	{
+		d0 |= bit2 << 6;
+		d1 |= bit3 << 6;
+	}
+
+	// sign-extend 'd0' and 'd1'
+	// note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
+	int32_t d0x = d0;
+	int32_t d1x = d1;
+	int sx_shamt = 32 - dbits;
+	d0x <<= sx_shamt;
+	d0x >>= sx_shamt;
+	d1x <<= sx_shamt;
+	d1x >>= sx_shamt;
+	d0 = d0x;
+	d1 = d1x;
+
+	// expand all values to 12 bits, with left-shift as needed.
+	int val_shamt = (modeval >> 1) ^ 3;
+	a <<= val_shamt;
+	b0 <<= val_shamt;
+	b1 <<= val_shamt;
+	c <<= val_shamt;
+	d0 <<= val_shamt;
+	d1 <<= val_shamt;
+
+	// then compute the actual color values.
+	int red1 = a;
+	int green1 = a - b0;
+	int blue1 = a - b1;
+	int red0 = a - c;
+	int green0 = a - b0 - c - d0;
+	int blue0 = a - b1 - c - d1;
+
+	// clamp the color components to [0,2^12 - 1]
+	red0 = astc::clamp(red0, 0, 4095);
+	green0 = astc::clamp(green0, 0, 4095);
+	blue0 = astc::clamp(blue0, 0, 4095);
+
+	red1 = astc::clamp(red1, 0, 4095);
+	green1 = astc::clamp(green1, 0, 4095);
+	blue1 = astc::clamp(blue1, 0, 4095);
+
+	// switch around the color components
+	int temp0, temp1;
+	switch (majcomp)
+	{
+	case 1:					// switch around red and green
+		temp0 = red0;
+		temp1 = red1;
+		red0 = green0;
+		red1 = green1;
+		green0 = temp0;
+		green1 = temp1;
+		break;
+	case 2:					// switch around red and blue
+		temp0 = red0;
+		temp1 = red1;
+		red0 = blue0;
+		red1 = blue1;
+		blue0 = temp0;
+		blue1 = temp1;
+		break;
+	case 0:					// no switch
+		break;
+	}
+
+	// Shift 12-bit values into 16-bit range; 0x7800 is FP16 1.0 for alpha
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB + LDR A direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_ldr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	// Decode the HDR RGB part, then overwrite alpha with the direct LDR bytes
+	hdr_rgb_unpack(input, output0, output1);
+	output0.set_lane<3>(input[6]);
+	output1.set_lane<3>(input[7]);
+}
+
+/**
+ * @brief Unpack an HDR L (small range) direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_small_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+
+	// The top bit of v0 selects between two base/delta bit layouts
+	int y0, y1;
+	if (v0 & 0x80)
+	{
+		y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
+		y1 = (v1 & 0x1F) << 2;
+	}
+	else
+	{
+		y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
+		y1 = (v1 & 0xF) << 1;
+	}
+
+	// The second luminance is a delta on the first; clamp to 12 bits
+	y1 += y0;
+	if (y1 > 0xFFF)
+	{
+		y1 = 0xFFF;
+	}
+
+	// Shift 12-bit values into 16-bit range; 0x7800 is FP16 1.0 for alpha
+	output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
+	output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR L (large range) direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_large_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+
+	// Stored order encodes the layout: v1 >= v0 means direct values, a
+	// reversed pair means offset values
+	int y0, y1;
+	if (v1 >= v0)
+	{
+		y0 = v0 << 4;
+		y1 = v1 << 4;
+	}
+	else
+	{
+		y0 = (v1 << 4) + 8;
+		y1 = (v0 << 4) - 8;
+	}
+
+	// Shift 12-bit values into 16-bit range; 0x7800 is FP16 1.0 for alpha
+	output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
+	output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR A direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_alpha_unpack(
+	const uint8_t input[2],
+	int& output0,
+	int& output1
+) {
+
+	int v6 = input[0];
+	int v7 = input[1];
+
+	// 2-bit layout selector built from the top bits of both bytes
+	int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
+	v6 &= 0x7F;
+	v7 &= 0x7F;
+	if (selector == 3)
+	{
+		// Direct layout: both values are stored as 7-bit values, scaled up
+		output0 = v6 << 5;
+		output1 = v7 << 5;
+	}
+	else
+	{
+		// Delta layout: rebuild the base, sign-extend the delta, then apply it
+		v6 |= (v7 << (selector + 1)) & 0x780;
+		v7 &= (0x3f >> selector);
+		v7 ^= 32 >> selector;
+		v7 -= 32 >> selector;
+		v6 <<= (4 - selector);
+		v7 <<= (4 - selector);
+		v7 += v6;
+
+		// Clamp the delta-derived endpoint into the 12-bit range
+		if (v7 < 0)
+		{
+			v7 = 0;
+		}
+		else if (v7 > 0xFFF)
+		{
+			v7 = 0xFFF;
+		}
+
+		output0 = v6;
+		output1 = v7;
+	}
+
+	// Shift the 12-bit results into the 16-bit range
+	output0 <<= 4;
+	output1 <<= 4;
+}
+
+/**
+ * @brief Unpack an HDR RGBA direct encoding.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_hdr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	// Decode the HDR RGB endpoints first
+	hdr_rgb_unpack(input, output0, output1);
+
+	// Then decode the trailing two bytes as HDR alpha and splice them in
+	int a0, a1;
+	hdr_alpha_unpack(input + 6, a0, a1);
+	output0.set_lane<3>(a0);
+	output1.set_lane<3>(a1);
+}
+
+/* See header for documentation. */
+void unpack_color_endpoints(
+	astcenc_profile decode_mode,
+	int format,
+	const uint8_t* input,
+	bool& rgb_hdr,
+	bool& alpha_hdr,
+	vint4& output0,
+	vint4& output1
+) {
+	// Assume no NaNs and LDR endpoints unless set later
+	rgb_hdr = false;
+	alpha_hdr = false;
+
+	bool alpha_hdr_default = false;
+
+	switch (format)
+	{
+	case FMT_LUMINANCE:
+		luminance_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_DELTA:
+		luminance_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_SMALL_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_small_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_LARGE_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_large_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA:
+		luminance_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA_DELTA:
+		luminance_alpha_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB_SCALE:
+		{
+			vint4 input0q(input[0], input[1], input[2], 0);
+			uint8_t scale = input[3];
+			rgb_scale_unpack(input0q, scale, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_SCALE_ALPHA:
+		{
+			vint4 input0q(input[0], input[1], input[2], input[4]);
+			uint8_t alpha1q = input[5];
+			uint8_t scaleq = input[3];
+			rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_SCALE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgbo_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB:
+		{
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgb_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGBA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGBA_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_LDR_ALPHA:
+		rgb_hdr = true;
+		hdr_rgb_ldr_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_RGBA:
+		rgb_hdr = true;
+		alpha_hdr = true;
+		hdr_rgb_hdr_alpha_unpack(input, output0, output1);
+		break;
+	}
+
+	// Assign a correct default alpha
+	if (alpha_hdr_default)
+	{
+		if (decode_mode == ASTCENC_PRF_HDR)
+		{
+			output0.set_lane<3>(0x7800);
+			output1.set_lane<3>(0x7800);
+			alpha_hdr = true;
+		}
+		else
+		{
+			output0.set_lane<3>(0x00FF);
+			output1.set_lane<3>(0x00FF);
+			alpha_hdr = false;
+		}
+	}
+
+	vint4 ldr_scale(257);
+	vint4 hdr_scale(1);
+	vint4 output_scale = ldr_scale;
+
+	// An LDR profile image
+	if ((decode_mode == ASTCENC_PRF_LDR) ||
+	    (decode_mode == ASTCENC_PRF_LDR_SRGB))
+	{
+		// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
+		if (rgb_hdr == true)
+		{
+			output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
+			output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
+			output_scale = hdr_scale;
+
+			rgb_hdr = false;
+			alpha_hdr = false;
+		}
+	}
+	// An HDR profile image
+	else
+	{
+		vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
+		output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
+	}
+
+	output0 = output0 * output_scale;
+	output1 = output1 * output_scale;
+}

+ 1455 - 0
thirdparty/astcenc/astcenc_compress_symbolic.cpp

@@ -0,0 +1,1455 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions to compress a symbolic block.
+ */
+
+#include "astcenc_internal.h"
+#include "astcenc_diagnostic_trace.h"
+
+#include <cassert>
+
+/**
+ * @brief Merge two planes of endpoints into a single vector.
+ *
+ * @param      ep_plane1          The endpoints for plane 1.
+ * @param      ep_plane2          The endpoints for plane 2.
+ * @param      component_plane2   The color component for plane 2.
+ * @param[out] result             The merged output.
+ */
+static void merge_endpoints(
+	const endpoints& ep_plane1,
+	const endpoints& ep_plane2,
+	unsigned int component_plane2,
+	endpoints& result
+) {
+	unsigned int partition_count = ep_plane1.partition_count;
+	// Only single-partition endpoint sets are supported here
+	assert(partition_count == 1);
+
+	// Take the plane 2 component lane from the plane 2 endpoints, all other
+	// lanes from plane 1
+	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
+
+	result.partition_count = partition_count;
+	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
+	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
+}
+
+/**
+ * @brief Attempt to improve weights given a chosen configuration.
+ *
+ * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
+ * partition and per plane) and attempt to improve image quality by moving each weight up by one or
+ * down by one quantization step.
+ *
+ * This is a specialized function which only supports operating on undecimated weight grids,
+ * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
+ * is needed less often.
+ *
+ * @param      decode_mode   The decode mode (LDR, HDR).
+ * @param      bsd           The block size information.
+ * @param      blk           The image block color data to compress.
+ * @param[out] scb           The symbolic compressed block output.
+ */
+static bool realign_weights_undecimated(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	symbolic_compressed_block& scb
+) {
+	// Get the partition descriptor
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the quantization table
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	unsigned int weight_quant_level = bm.quant_mode;
+	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
+
+	unsigned int max_plane = bm.is_dual_plane;
+	int plane2_component = scb.plane2_component;
+	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
+
+	// Decode the color endpoints
+	bool rgb_hdr;
+	bool alpha_hdr;
+	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
+	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
+	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
+	vfloat4 offset[BLOCK_MAX_PARTITIONS];
+
+	promise(partition_count > 0);
+
+	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+	{
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[pa_idx],
+		                       scb.color_values[pa_idx],
+		                       rgb_hdr, alpha_hdr,
+		                       endpnt0[pa_idx],
+		                       endpnt1[pa_idx]);
+	}
+
+	uint8_t* dec_weights_uquant = scb.weights;
+	bool adjustments = false;
+
+	// For each plane and partition ...
+	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
+	{
+		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+		{
+			// Compute the endpoint delta for all components in current plane
+			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
+			epd = select(epd, vint4::zero(), plane_mask);
+
+			// 1/64 maps the 0..64 weight range onto a 0..1 lerp factor
+			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
+			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
+		}
+
+		// For each weight compute previous, current, and next errors
+		promise(bsd.texel_count > 0);
+		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
+		{
+			int uqw = dec_weights_uquant[texel];
+
+			// Neighboring representable weight values are packed as two bytes
+			uint32_t prev_and_next = qat.prev_next_values[uqw];
+			int uqw_down = prev_and_next & 0xFF;
+			int uqw_up = (prev_and_next >> 8) & 0xFF;
+
+			// Interpolate the colors to create the diffs
+			float weight_base = static_cast<float>(uqw);
+			float weight_down = static_cast<float>(uqw_down - uqw);
+			float weight_up = static_cast<float>(uqw_up - uqw);
+
+			unsigned int partition = pi.partition_of_texel[texel];
+			vfloat4 color_offset = offset[partition];
+			vfloat4 color_base   = endpnt0f[partition];
+
+			vfloat4 color = color_base + color_offset * weight_base;
+			vfloat4 orig_color   = blk.texel(texel);
+			vfloat4 error_weight = blk.channel_weight;
+
+			vfloat4 color_diff      = color - orig_color;
+			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
+			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
+
+			float error_base = dot_s(color_diff      * color_diff,      error_weight);
+			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
+			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
+
+			// Check if the prev or next error is better, and if so use it
+			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
+			{
+				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
+				adjustments = true;
+			}
+			else if ((error_down < error_base) && (uqw > 0))
+			{
+				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
+				adjustments = true;
+			}
+		}
+
+		// Prepare iteration for plane 2
+		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
+		plane_mask = ~plane_mask;
+	}
+
+	return adjustments;
+}
+
+/**
+ * @brief Attempt to improve weights given a chosen configuration.
+ *
+ * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
+ * partition and per plane) and attempt to improve image quality by moving each weight up by one or
+ * down by one quantization step.
+ *
+ * @param      decode_mode   The decode mode (LDR, HDR).
+ * @param      bsd           The block size information.
+ * @param      blk           The image block color data to compress.
+ * @param[out] scb           The symbolic compressed block output.
+ */
+static bool realign_weights_decimated(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	symbolic_compressed_block& scb
+) {
+	// Get the partition descriptor
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the quantization table
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	unsigned int weight_quant_level = bm.quant_mode;
+	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
+
+	// Get the decimation table
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+	unsigned int weight_count = di.weight_count;
+	assert(weight_count != bsd.texel_count);
+
+	unsigned int max_plane = bm.is_dual_plane;
+	int plane2_component = scb.plane2_component;
+	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
+
+	// Decode the color endpoints
+	bool rgb_hdr;
+	bool alpha_hdr;
+	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
+	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
+	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
+	vfloat4 offset[BLOCK_MAX_PARTITIONS];
+
+	promise(partition_count > 0);
+	promise(weight_count > 0);
+
+	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+	{
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[pa_idx],
+		                       scb.color_values[pa_idx],
+		                       rgb_hdr, alpha_hdr,
+		                       endpnt0[pa_idx],
+		                       endpnt1[pa_idx]);
+	}
+
+	uint8_t* dec_weights_uquant = scb.weights;
+	bool adjustments = false;
+
+	// For each plane and partition ...
+	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
+	{
+		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+		{
+			// Compute the endpoint delta for all components in current plane
+			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
+			epd = select(epd, vint4::zero(), plane_mask);
+
+			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
+			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
+		}
+
+		// Create an unquantized weight grid for this decimation level
+		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
+		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
+		{
+			vint unquant_value(dec_weights_uquant + we_idx);
+			vfloat unquant_valuef = int_to_float(unquant_value);
+			storea(unquant_valuef, uq_weightsf + we_idx);
+		}
+
+		// For each weight compute previous, current, and next errors
+		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
+		{
+			int uqw = dec_weights_uquant[we_idx];
+			uint32_t prev_and_next = qat.prev_next_values[uqw];
+
+			float uqw_base = uq_weightsf[we_idx];
+			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
+			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
+
+			float uqw_diff_down = uqw_down - uqw_base;
+			float uqw_diff_up = uqw_up - uqw_base;
+
+			vfloat4 error_basev = vfloat4::zero();
+			vfloat4 error_downv = vfloat4::zero();
+			vfloat4 error_upv = vfloat4::zero();
+
+			// Interpolate the colors to create the diffs
+			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
+			promise(texels_to_evaluate > 0);
+			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
+			{
+				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
+
+				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
+
+				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
+				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
+					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
+				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
+
+				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
+				// float weight = astc::flt_rd(weight_base + 0.5f);
+				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
+				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
+				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
+				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
+
+				unsigned int partition = pi.partition_of_texel[texel];
+				vfloat4 color_offset = offset[partition];
+				vfloat4 color_base   = endpnt0f[partition];
+
+				vfloat4 color = color_base + color_offset * weight_base;
+				vfloat4 orig_color = blk.texel(texel);
+
+				vfloat4 color_diff      = color - orig_color;
+				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
+				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
+
+				error_basev += color_diff * color_diff;
+				error_downv += color_down_diff * color_down_diff;
+				error_upv   += color_up_diff * color_up_diff;
+			}
+
+			vfloat4 error_weight = blk.channel_weight;
+			float error_base = hadd_s(error_basev * error_weight);
+			float error_down = hadd_s(error_downv * error_weight);
+			float error_up   = hadd_s(error_upv   * error_weight);
+
+			// Check if the prev or next error is better, and if so use it
+			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
+			{
+				uq_weightsf[we_idx] = uqw_up;
+				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
+				adjustments = true;
+			}
+			else if ((error_down < error_base) && (uqw > 0))
+			{
+				uq_weightsf[we_idx] = uqw_down;
+				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
+				adjustments = true;
+			}
+		}
+
+		// Prepare iteration for plane 2
+		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
+		plane_mask = ~plane_mask;
+	}
+
+	return adjustments;
+}
+
+/**
+ * @brief Compress a block using a chosen partitioning and 1 plane of weights.
+ *
+ * @param      config                    The compressor configuration.
+ * @param      bsd                       The block size information.
+ * @param      blk                       The image block color data to compress.
+ * @param      only_always               True if we only use "always" percentile block modes.
+ * @param      tune_errorval_threshold   The error value threshold.
+ * @param      partition_count           The partition count.
+ * @param      partition_index           The partition index if @c partition_count is 2-4.
+ * @param[out] scb                       The symbolic compressed block output.
+ * @param[in,out] tmpbuf                 Preallocated scratch buffers for the compressor.
+ * @param      quant_limit               The maximum weight quantization level allowed.
+ */
+static float compress_symbolic_block_for_partition_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	bool only_always,
+	float tune_errorval_threshold,
+	unsigned int partition_count,
+	unsigned int partition_index,
+	symbolic_compressed_block& scb,
+	compression_working_buffers& tmpbuf,
+	int quant_limit
+) {
+	promise(partition_count > 0);
+	promise(config.tune_candidate_limit > 0);
+	promise(config.tune_refinement_limit > 0);
+
+	// Weight quantization is never pushed above QUANT_32, even if the caller allows more
+	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
+
+	// The 1 partition, non-RGBM case has a cheaper specialized difference function
+	auto compute_difference = &compute_symbolic_block_difference_1plane;
+	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
+	{
+		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
+	}
+
+	const auto& pi = bsd.get_partition_info(partition_count, partition_index);
+
+	// Compute ideal weights and endpoint colors, with no quantization or decimation
+	endpoints_and_weights& ei = tmpbuf.ei1;
+	compute_ideal_colors_and_weights_1plane(blk, pi, ei);
+
+	// Compute ideal weights and endpoint colors for every decimation
+	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
+	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
+
+	// For each decimation mode, compute an ideal set of weights with no quantization
+	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
+	                                                : bsd.decimation_mode_count_selected;
+	promise(max_decimation_modes > 0);
+	for (unsigned int i = 0; i < max_decimation_modes; i++)
+	{
+		const auto& dm = bsd.get_decimation_mode(i);
+		if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		const auto& di = bsd.get_decimation_info(i);
+
+		compute_ideal_weights_for_decimation(
+		    ei,
+		    di,
+		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
+	}
+
+	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
+	// weight pair, compute the smallest weight that will result in a color value greater than 1
+	vfloat4 min_ep(10.0f);
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
+
+		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
+		min_ep = select(min_ep, ep, use_ep);
+	}
+
+	float min_wt_cutoff = hmin_s(min_ep);
+
+	// For each mode, use the angular method to compute a shift
+	compute_angular_endpoints_1plane(
+	    only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
+
+	float* weight_low_value = tmpbuf.weight_low_value1;
+	float* weight_high_value = tmpbuf.weight_high_value1;
+	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
+	float* qwt_errors = tmpbuf.qwt_errors;
+
+	// For each mode (which specifies a decimation and a quantization):
+	//     * Compute number of bits needed for the quantized weights
+	//     * Generate an optimized set of quantized weights
+	//     * Compute quantization errors for the mode
+
+
+	// Bits remaining for color endpoint data for each partition count (index is
+	// partition_count - 1); the per-mode weight bits are subtracted from this below
+	static const int8_t free_bits_for_partition_count[4] {
+		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
+	};
+
+	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
+	                                           : bsd.block_mode_count_1plane_selected;
+	promise(max_block_modes > 0);
+	for (unsigned int i = 0; i < max_block_modes; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+
+		if (bm.quant_mode > max_weight_quant)
+		{
+			qwt_errors[i] = 1e38f;
+			continue;
+		}
+
+		assert(!bm.is_dual_plane);
+		// A mode with no bits left for color data cannot be encoded; skip it
+		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
+		if (bitcount <= 0)
+		{
+			qwt_errors[i] = 1e38f;
+			continue;
+		}
+
+		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
+		{
+			weight_high_value[i] = 1.0f;
+		}
+
+		int decimation_mode = bm.decimation_mode;
+		const auto& di = bsd.get_decimation_info(decimation_mode);
+
+		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
+
+		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
+
+		// Generate the optimized set of weights for the weight mode
+		compute_quantized_weights_for_decimation(
+		    di,
+		    weight_low_value[i], weight_high_value[i],
+		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
+		    dec_weights_uquantf,
+		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
+		    bm.get_weight_quant_mode());
+
+		// Compute weight quantization errors for the block mode
+		qwt_errors[i] = compute_error_of_weight_set_1plane(
+		    ei,
+		    di,
+		    dec_weights_uquantf);
+	}
+
+	// Decide the optimal combination of color endpoint encodings and weight encodings
+	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
+	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
+
+	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
+	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
+
+	unsigned int candidate_count = compute_ideal_endpoint_formats(
+	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
+	    config.tune_candidate_limit, 0, max_block_modes,
+	    partition_format_specifiers, block_mode_index,
+	    color_quant_level, color_quant_level_mod, tmpbuf);
+
+	// Iterate over the N believed-to-be-best modes to find out which one is actually best
+	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
+	float best_errorval_in_scb = scb.errorval;
+
+	for (unsigned int i = 0; i < candidate_count; i++)
+	{
+		TRACE_NODE(node0, "candidate");
+
+		const int bm_packed_index = block_mode_index[i];
+		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
+		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
+
+		int decimation_mode = qw_bm.decimation_mode;
+		const auto& di = bsd.get_decimation_info(decimation_mode);
+		promise(di.weight_count > 0);
+
+		trace_add_data("weight_x", di.weight_x);
+		trace_add_data("weight_y", di.weight_y);
+		trace_add_data("weight_z", di.weight_z);
+		trace_add_data("weight_quant", qw_bm.quant_mode);
+
+		// Recompute the ideal color endpoints before storing them
+		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
+		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
+
+		symbolic_compressed_block workscb;
+		endpoints workep = ei.ep;
+
+		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
+
+		for (unsigned int j = 0; j < di.weight_count; j++)
+		{
+			workscb.weights[j] = u8_weight_src[j];
+		}
+
+		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
+		{
+			recompute_ideal_colors_1plane(
+			    blk, pi, di, workscb.weights,
+			    workep, rgbs_colors, rgbo_colors);
+
+			// Quantize the chosen color, tracking if worth trying the mod value
+			// (only possible when the mod quant level differs from the base level)
+			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				workscb.color_formats[j] = pack_color_endpoints(
+				    workep.endpt0[j],
+				    workep.endpt1[j],
+				    rgbs_colors[j],
+				    rgbo_colors[j],
+				    partition_format_specifiers[i][j],
+				    workscb.color_values[j],
+				    color_quant_level[i]);
+
+				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
+			}
+
+			// If all the color endpoint modes are the same, we get a few more bits to store colors;
+			// let's see if we can take advantage of this: requantize all the colors and see if the
+			// endpoint modes remain the same.
+			workscb.color_formats_matched = 0;
+			if (partition_count >= 2 && all_same)
+			{
+				uint8_t colorvals[BLOCK_MAX_PARTITIONS][12];
+				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
+				bool all_same_mod = true;
+				for (unsigned int j = 0; j < partition_count; j++)
+				{
+					color_formats_mod[j] = pack_color_endpoints(
+					    workep.endpt0[j],
+					    workep.endpt1[j],
+					    rgbs_colors[j],
+					    rgbo_colors[j],
+					    partition_format_specifiers[i][j],
+					    colorvals[j],
+					    color_quant_level_mod[i]);
+
+					// Early out as soon as it's no longer possible to use mod
+					if (color_formats_mod[j] != color_formats_mod[0])
+					{
+						all_same_mod = false;
+						break;
+					}
+				}
+
+				if (all_same_mod)
+				{
+					workscb.color_formats_matched = 1;
+					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
+					{
+						for (unsigned int k = 0; k < 8; k++)
+						{
+							workscb.color_values[j][k] = colorvals[j][k];
+						}
+
+						workscb.color_formats[j] = color_formats_mod[j];
+					}
+				}
+			}
+
+			// Store header fields
+			workscb.partition_count = static_cast<uint8_t>(partition_count);
+			workscb.partition_index = static_cast<uint16_t>(partition_index);
+			workscb.plane2_component = -1;
+			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
+			workscb.block_mode = qw_bm.mode_index;
+			workscb.block_type = SYM_BTYPE_NONCONST;
+
+			// Pre-realign test
+			if (l == 0)
+			{
+				// A result of -ERROR_CALC_DEFAULT means the encoding was invalid; flag as error
+				float errorval = compute_difference(config, bsd, workscb, blk);
+				if (errorval == -ERROR_CALC_DEFAULT)
+				{
+					errorval = -errorval;
+					workscb.block_type = SYM_BTYPE_ERROR;
+				}
+
+				trace_add_data("error_prerealign", errorval);
+				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
+				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
+				// drive a heuristic to skip blocks that are unlikely to catch up with the best
+				// block we have already.
+				unsigned int iters_remaining = config.tune_refinement_limit - l;
+				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
+				if (errorval > (threshold * best_errorval_in_scb))
+				{
+					break;
+				}
+
+				if (errorval < best_errorval_in_scb)
+				{
+					best_errorval_in_scb = errorval;
+					workscb.errorval = errorval;
+					scb = workscb;
+
+					if (errorval < tune_errorval_threshold)
+					{
+						// Skip remaining candidates - this is "good enough"
+						i = candidate_count;
+						break;
+					}
+				}
+			}
+
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_decimated(
+					config.profile, bsd, blk, workscb);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+					config.profile, bsd, blk, workscb);
+			}
+
+			// Post-realign test
+			float errorval = compute_difference(config, bsd, workscb, blk);
+			if (errorval == -ERROR_CALC_DEFAULT)
+			{
+				errorval = -errorval;
+				workscb.block_type = SYM_BTYPE_ERROR;
+			}
+
+			trace_add_data("error_postrealign", errorval);
+			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
+			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
+			// give benefit of the doubt ...
+			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
+			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
+			if (errorval > (threshold * best_errorval_in_scb))
+			{
+				break;
+			}
+
+			if (errorval < best_errorval_in_scb)
+			{
+				best_errorval_in_scb = errorval;
+				workscb.errorval = errorval;
+				scb = workscb;
+
+				if (errorval < tune_errorval_threshold)
+				{
+					// Skip remaining candidates - this is "good enough"
+					i = candidate_count;
+					break;
+				}
+			}
+
+			// Realignment made no changes, so further refinement passes cannot improve
+			if (!adjustments)
+			{
+				break;
+			}
+		}
+	}
+
+	return best_errorval_in_mode;
+}
+
+/**
+ * @brief Compress a block using a chosen partitioning and 2 planes of weights.
+ *
+ * @param      config                    The compressor configuration.
+ * @param      bsd                       The block size information.
+ * @param      blk                       The image block color data to compress.
+ * @param      tune_errorval_threshold   The error value threshold.
+ * @param      plane2_component          The component index for the second plane of weights.
+ * @param[out] scb                       The symbolic compressed block output.
+ * @param[in,out] tmpbuf                 Preallocated scratch buffers for the compressor.
+ * @param      quant_limit               The maximum weight quantization level allowed.
+ */
+static float compress_symbolic_block_for_partition_2planes(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	float tune_errorval_threshold,
+	unsigned int plane2_component,
+	symbolic_compressed_block& scb,
+	compression_working_buffers& tmpbuf,
+	int quant_limit
+) {
+	promise(config.tune_candidate_limit > 0);
+	promise(config.tune_refinement_limit > 0);
+	promise(bsd.decimation_mode_count_selected > 0);
+
+	// Weight quantization is never pushed above QUANT_32, even if the caller allows more
+	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
+
+	// Compute ideal weights and endpoint colors, with no quantization or decimation
+	endpoints_and_weights& ei1 = tmpbuf.ei1;
+	endpoints_and_weights& ei2 = tmpbuf.ei2;
+
+	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
+
+	// Compute ideal weights and endpoint colors for every decimation
+	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
+	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
+
+	// For each decimation mode, compute an ideal set of weights with no quantization
+	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
+	{
+		const auto& dm = bsd.get_decimation_mode(i);
+		if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		const auto& di = bsd.get_decimation_info(i);
+
+		compute_ideal_weights_for_decimation(
+		    ei1,
+		    di,
+		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
+
+		compute_ideal_weights_for_decimation(
+		    ei2,
+		    di,
+		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
+	}
+
+	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
+	// weight pair, compute the smallest weight that will result in a color value greater than 1
+	vfloat4 min_ep1(10.0f);
+	vfloat4 min_ep2(10.0f);
+
+	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
+	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
+	min_ep1 = select(min_ep1, ep1, use_ep1);
+
+	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
+	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
+	min_ep2 = select(min_ep2, ep2, use_ep2);
+
+	vfloat4 err_max(ERROR_CALC_DEFAULT);
+	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
+
+	// Set the plane2 component to max error in ep1
+	min_ep1 = select(min_ep1, err_max, err_mask);
+
+	float min_wt_cutoff1 = hmin_s(min_ep1);
+
+	// Set the minwt2 to the plane2 component min in ep2
+	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
+
+	compute_angular_endpoints_2planes(
+	    bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
+
+	// For each mode (which specifies a decimation and a quantization):
+	//     * Compute number of bits needed for the quantized weights
+	//     * Generate an optimized set of quantized weights
+	//     * Compute quantization errors for the mode
+
+	float* weight_low_value1 = tmpbuf.weight_low_value1;
+	float* weight_high_value1 = tmpbuf.weight_high_value1;
+	float* weight_low_value2 = tmpbuf.weight_low_value2;
+	float* weight_high_value2 = tmpbuf.weight_high_value2;
+
+	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
+	float* qwt_errors = tmpbuf.qwt_errors;
+
+	// Dual-plane block modes are stored after the single-plane modes in the mode table
+	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
+	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
+
+	for (unsigned int i = start_2plane; i < end_2plane; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+		assert(bm.is_dual_plane);
+
+		if (bm.quant_mode > max_weight_quant)
+		{
+			qwt_errors[i] = 1e38f;
+			continue;
+		}
+
+		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
+
+		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
+		{
+			weight_high_value1[i] = 1.0f;
+		}
+
+		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
+		{
+			weight_high_value2[i] = 1.0f;
+		}
+
+		unsigned int decimation_mode = bm.decimation_mode;
+		const auto& di = bsd.get_decimation_info(decimation_mode);
+
+		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
+
+		// Generate the optimized set of weights for the mode
+		compute_quantized_weights_for_decimation(
+		    di,
+		    weight_low_value1[i],
+		    weight_high_value1[i],
+		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
+		    dec_weights_uquantf,
+		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
+		    bm.get_weight_quant_mode());
+
+		compute_quantized_weights_for_decimation(
+		    di,
+		    weight_low_value2[i],
+		    weight_high_value2[i],
+		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
+		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
+		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
+		    bm.get_weight_quant_mode());
+
+		// Compute weight quantization errors for the block mode
+		qwt_errors[i] = compute_error_of_weight_set_2planes(
+		    ei1,
+		    ei2,
+		    di,
+		    dec_weights_uquantf,
+		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
+	}
+
+	// Decide the optimal combination of color endpoint encodings and weight encodings
+	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
+	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
+
+	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
+	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
+
+	endpoints epm;
+	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
+
+	const auto& pi = bsd.get_partition_info(1, 0);
+	unsigned int candidate_count = compute_ideal_endpoint_formats(
+	    pi, blk, epm, qwt_bitcounts, qwt_errors,
+	    config.tune_candidate_limit,
+		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
+	    partition_format_specifiers, block_mode_index,
+	    color_quant_level, color_quant_level_mod, tmpbuf);
+
+	// Iterate over the N believed-to-be-best modes to find out which one is actually best
+	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
+	float best_errorval_in_scb = scb.errorval;
+
+	for (unsigned int i = 0; i < candidate_count; i++)
+	{
+		TRACE_NODE(node0, "candidate");
+
+		const int bm_packed_index = block_mode_index[i];
+		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
+		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
+		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
+
+		int decimation_mode = qw_bm.decimation_mode;
+		const auto& di = bsd.get_decimation_info(decimation_mode);
+		promise(di.weight_count > 0);
+
+		trace_add_data("weight_x", di.weight_x);
+		trace_add_data("weight_y", di.weight_y);
+		trace_add_data("weight_z", di.weight_z);
+		trace_add_data("weight_quant", qw_bm.quant_mode);
+
+		vfloat4 rgbs_color;
+		vfloat4 rgbo_color;
+
+		symbolic_compressed_block workscb;
+		endpoints workep = epm;
+
+		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
+		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
+
+		// Use an unsigned index to match di.weight_count, consistent with the
+		// 1 plane variant of this function, avoiding a signed/unsigned compare
+		for (unsigned int j = 0; j < di.weight_count; j++)
+		{
+			workscb.weights[j] = u8_weight1_src[j];
+			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
+		}
+
+		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
+		{
+			recompute_ideal_colors_2planes(
+			    blk, bsd, di,
+			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
+			    workep, rgbs_color, rgbo_color, plane2_component);
+
+			// Quantize the chosen color
+			workscb.color_formats[0] = pack_color_endpoints(
+			                               workep.endpt0[0],
+			                               workep.endpt1[0],
+			                               rgbs_color, rgbo_color,
+			                               partition_format_specifiers[i][0],
+			                               workscb.color_values[0],
+			                               color_quant_level[i]);
+
+			// Store header fields
+			workscb.partition_count = 1;
+			workscb.partition_index = 0;
+			workscb.quant_mode = color_quant_level[i];
+			workscb.color_formats_matched = 0;
+			workscb.block_mode = qw_bm.mode_index;
+			workscb.plane2_component = static_cast<int8_t>(plane2_component);
+			workscb.block_type = SYM_BTYPE_NONCONST;
+
+			// Pre-realign test
+			if (l == 0)
+			{
+				// A result of -ERROR_CALC_DEFAULT means the encoding was invalid; flag as error
+				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
+				if (errorval == -ERROR_CALC_DEFAULT)
+				{
+					errorval = -errorval;
+					workscb.block_type = SYM_BTYPE_ERROR;
+				}
+
+				trace_add_data("error_prerealign", errorval);
+				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
+				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
+				// drive a heuristic to skip blocks that are unlikely to catch up with the best
+				// block we have already.
+				unsigned int iters_remaining = config.tune_refinement_limit - l;
+				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
+				if (errorval > (threshold * best_errorval_in_scb))
+				{
+					break;
+				}
+
+				if (errorval < best_errorval_in_scb)
+				{
+					best_errorval_in_scb = errorval;
+					workscb.errorval = errorval;
+					scb = workscb;
+
+					if (errorval < tune_errorval_threshold)
+					{
+						// Skip remaining candidates - this is "good enough"
+						i = candidate_count;
+						break;
+					}
+				}
+			}
+
+			// Perform a final pass over the weights to try to improve them.
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_decimated(
+					config.profile, bsd, blk, workscb);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+					config.profile, bsd, blk, workscb);
+			}
+
+			// Post-realign test
+			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
+			if (errorval == -ERROR_CALC_DEFAULT)
+			{
+				errorval = -errorval;
+				workscb.block_type = SYM_BTYPE_ERROR;
+			}
+
+			trace_add_data("error_postrealign", errorval);
+			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
+			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
+			// give benefit of the doubt ...
+			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
+			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
+			if (errorval > (threshold * best_errorval_in_scb))
+			{
+				break;
+			}
+
+			if (errorval < best_errorval_in_scb)
+			{
+				best_errorval_in_scb = errorval;
+				workscb.errorval = errorval;
+				scb = workscb;
+
+				if (errorval < tune_errorval_threshold)
+				{
+					// Skip remaining candidates - this is "good enough"
+					i = candidate_count;
+					break;
+				}
+			}
+
+			// Realignment made no changes, so further refinement passes cannot improve
+			if (!adjustments)
+			{
+				break;
+			}
+		}
+	}
+
+	return best_errorval_in_mode;
+}
+
+/**
+ * @brief Determine the lowest cross-channel correlation factor.
+ *
+ * @param texels_per_block   The number of texels in a block.
+ * @param blk                The image block color data to compress.
+ *
+ * @return Return the lowest correlation factor.
+ */
+static float prepare_block_statistics(
+	int texels_per_block,
+	const image_block& blk
+) {
+	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
+	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
+	float rs = 0.0f;
+	float gs = 0.0f;
+	float bs = 0.0f;
+	float as = 0.0f;
+	float rr_var = 0.0f;
+	float gg_var = 0.0f;
+	float bb_var = 0.0f;
+	float aa_var = 0.0f;
+	float rg_cov = 0.0f;
+	float rb_cov = 0.0f;
+	float ra_cov = 0.0f;
+	float gb_cov = 0.0f;
+	float ga_cov = 0.0f;
+	float ba_cov = 0.0f;
+
+	float weight_sum = 0.0f;
+
+	// The per-texel error weight is uniform across the block, so it is loop-invariant;
+	// compute and validate it once rather than once per texel
+	float weight = hadd_s(blk.channel_weight) / 4.0f;
+	assert(weight >= 0.0f);
+
+	promise(texels_per_block > 0);
+	for (int i = 0; i < texels_per_block; i++)
+	{
+		weight_sum += weight;
+
+		float r = blk.data_r[i];
+		float g = blk.data_g[i];
+		float b = blk.data_b[i];
+		float a = blk.data_a[i];
+
+		float rw = r * weight;
+		rs += rw;
+		rr_var += r * rw;
+		rg_cov += g * rw;
+		rb_cov += b * rw;
+		ra_cov += a * rw;
+
+		float gw = g * weight;
+		gs += gw;
+		gg_var += g * gw;
+		gb_cov += b * gw;
+		ga_cov += a * gw;
+
+		float bw = b * weight;
+		bs += bw;
+		bb_var += b * bw;
+		ba_cov += a * bw;
+
+		float aw = a * weight;
+		as += aw;
+		aa_var += a * aw;
+	}
+
+	// Clamp the divisor to avoid a divide-by-zero for fully zero-weighted blocks
+	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
+
+	rr_var -= rs * (rs * rpt);
+	rg_cov -= gs * (rs * rpt);
+	rb_cov -= bs * (rs * rpt);
+	ra_cov -= as * (rs * rpt);
+
+	gg_var -= gs * (gs * rpt);
+	gb_cov -= bs * (gs * rpt);
+	ga_cov -= as * (gs * rpt);
+
+	bb_var -= bs * (bs * rpt);
+	ba_cov -= as * (bs * rpt);
+
+	aa_var -= as * (as * rpt);
+
+	// These will give a NaN if a channel is constant - these are fixed up in the next step
+	rg_cov *= astc::rsqrt(rr_var * gg_var);
+	rb_cov *= astc::rsqrt(rr_var * bb_var);
+	ra_cov *= astc::rsqrt(rr_var * aa_var);
+	gb_cov *= astc::rsqrt(gg_var * bb_var);
+	ga_cov *= astc::rsqrt(gg_var * aa_var);
+	ba_cov *= astc::rsqrt(bb_var * aa_var);
+
+	// Treat a constant channel as perfectly correlated
+	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
+	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
+	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
+	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
+	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
+	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
+
+	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
+	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
+	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
+	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
+	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
+
+	// Diagnostic trace points
+	trace_add_data("min_r", blk.data_min.lane<0>());
+	trace_add_data("max_r", blk.data_max.lane<0>());
+	trace_add_data("min_g", blk.data_min.lane<1>());
+	trace_add_data("max_g", blk.data_max.lane<1>());
+	trace_add_data("min_b", blk.data_min.lane<2>());
+	trace_add_data("max_b", blk.data_max.lane<2>());
+	trace_add_data("min_a", blk.data_min.lane<3>());
+	trace_add_data("max_a", blk.data_max.lane<3>());
+	trace_add_data("cov_rg", fabsf(rg_cov));
+	trace_add_data("cov_rb", fabsf(rb_cov));
+	trace_add_data("cov_ra", fabsf(ra_cov));
+	trace_add_data("cov_gb", fabsf(gb_cov));
+	trace_add_data("cov_ga", fabsf(ga_cov));
+	trace_add_data("cov_ba", fabsf(ba_cov));
+
+	return lowest_correlation;
+}
+
+/**
+ * @brief Compress an image block into a physical compressed block.
+ *
+ * Trials are run in order of increasing cost: constant-color detection,
+ * 1 partition / 1 plane, 1 partition / 2 planes, then 2 to 4 partitions
+ * with 1 plane. The search exits early as soon as a trial beats the error
+ * threshold derived from the configured @c tune_db_limit.
+ *
+ * @param      ctx      The compressor context and configuration.
+ * @param      blk      The image block color data to compress.
+ * @param[out] pcb      The physical compressed block output.
+ * @param      tmpbuf   Preallocated scratch buffers for the trial encoders.
+ */
+void compress_block(
+	const astcenc_contexti& ctx,
+	const image_block& blk,
+	physical_compressed_block& pcb,
+	compression_working_buffers& tmpbuf)
+{
+	astcenc_profile decode_mode = ctx.config.profile;
+	symbolic_compressed_block scb;
+	const block_size_descriptor& bsd = *ctx.bsd;
+	float lowest_correl;
+
+	TRACE_NODE(node0, "block");
+	trace_add_data("pos_x", blk.xpos);
+	trace_add_data("pos_y", blk.ypos);
+	trace_add_data("pos_z", blk.zpos);
+
+	// Set stricter block targets for luminance data as we have more bits to play with
+	bool block_is_l = blk.is_luminance();
+	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
+
+	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
+	bool block_is_la = blk.is_luminancealpha();
+	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
+
+	bool block_skip_two_plane = false;
+	int max_partitions = ctx.config.tune_partition_count_limit;
+
+	unsigned int requested_partition_indices[3] {
+		ctx.config.tune_2partition_index_limit,
+		ctx.config.tune_3partition_index_limit,
+		ctx.config.tune_4partition_index_limit
+	};
+
+	unsigned int requested_partition_trials[3] {
+		ctx.config.tune_2partitioning_candidate_limit,
+		ctx.config.tune_3partitioning_candidate_limit,
+		ctx.config.tune_4partitioning_candidate_limit
+	};
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	// Do this early in diagnostic builds so we can dump uniform metrics
+	// for every block. Do it later in release builds to avoid redundant work!
+	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
+	float error_threshold = ctx.config.tune_db_limit
+	                      * error_weight_sum
+	                      * block_is_l_scale
+	                      * block_is_la_scale;
+
+	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
+	trace_add_data("lowest_correl", lowest_correl);
+	trace_add_data("tune_error_threshold", error_threshold);
+#endif
+
+	// Detected a constant-color block
+	if (all(blk.data_min == blk.data_max))
+	{
+		TRACE_NODE(node1, "pass");
+		trace_add_data("partition_count", 0);
+		trace_add_data("plane_count", 1);
+
+		scb.partition_count = 0;
+
+		// Encode as FP16 if using HDR
+		if ((decode_mode == ASTCENC_PRF_HDR) ||
+		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
+		{
+			scb.block_type = SYM_BTYPE_CONST_F16;
+			vint4 color_f16 = float_to_float16(blk.origin_texel);
+			store(color_f16, scb.constant_color);
+		}
+		// Encode as UNORM16 if NOT using HDR
+		else
+		{
+			scb.block_type = SYM_BTYPE_CONST_U16;
+			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
+			vint4 color_u16 = float_to_int_rtn(color_f32);
+			store(color_u16, scb.constant_color);
+		}
+
+		trace_add_data("exit", "quality hit");
+
+		symbolic_to_physical(bsd, scb, pcb);
+		return;
+	}
+
+#if !defined(ASTCENC_DIAGNOSTICS)
+	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
+	float error_threshold = ctx.config.tune_db_limit
+	                      * error_weight_sum
+	                      * block_is_l_scale
+	                      * block_is_la_scale;
+#endif
+
+	// Set SCB and mode errors to a very high error value
+	scb.errorval = ERROR_CALC_DEFAULT;
+	scb.block_type = SYM_BTYPE_ERROR;
+
+	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
+		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
+	};
+
+	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
+		0.0f,
+		ctx.config.tune_2_partition_early_out_limit_factor,
+		ctx.config.tune_3_partition_early_out_limit_factor,
+		0.0f
+	};
+
+	// Trial using 1 plane of weights and 1 partition.
+
+	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
+	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
+	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
+	// compression and slightly reduces image quality.
+
+	float errorval_mult[2] {
+		1.0f / ctx.config.tune_mse_overshoot,
+		1.0f
+	};
+
+	// Allow trials to overshoot the final error target; this must NOT be a
+	// function-local static, as the config (and hence the overshoot factor)
+	// can differ between contexts and a static would freeze the first value
+	float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
+
+	// Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels
+	int start_trial = 1;
+	if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
+	{
+		start_trial = 0;
+	}
+
+	int quant_limit = QUANT_32;
+	for (int i = start_trial; i < 2; i++)
+	{
+		TRACE_NODE(node1, "pass");
+		trace_add_data("partition_count", 1);
+		trace_add_data("plane_count", 1);
+		trace_add_data("search_mode", i);
+
+		float errorval = compress_symbolic_block_for_partition_1plane(
+		    ctx.config, bsd, blk, i == 0,
+		    error_threshold * errorval_mult[i] * errorval_overshoot,
+		    1, 0,  scb, tmpbuf, QUANT_32);
+
+		// Record the quant level so we can use it to filter later searches
+		const auto& bm = bsd.get_block_mode(scb.block_mode);
+		quant_limit = bm.get_weight_quant_mode();
+
+		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
+		if (errorval < (error_threshold * errorval_mult[i]))
+		{
+			trace_add_data("exit", "quality hit");
+			goto END_OF_TESTS;
+		}
+	}
+
+#if !defined(ASTCENC_DIAGNOSTICS)
+	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
+#endif
+
+	// Skip the two-plane trials entirely if the channels are well correlated
+	block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
+
+	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
+	// alpha is the most likely to be non-correlated if it is present in the data.
+	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
+	{
+		TRACE_NODE(node1, "pass");
+		trace_add_data("partition_count", 1);
+		trace_add_data("plane_count", 2);
+		trace_add_data("plane_component", i);
+
+		if (block_skip_two_plane)
+		{
+			trace_add_data("skip", "tune_2_plane_early_out_limit_correlation");
+			continue;
+		}
+
+		if (blk.grayscale && i != 3)
+		{
+			trace_add_data("skip", "grayscale block");
+			continue;
+		}
+
+		if (blk.is_constant_channel(i))
+		{
+			trace_add_data("skip", "constant component");
+			continue;
+		}
+
+		float errorval = compress_symbolic_block_for_partition_2planes(
+		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
+		    i, scb, tmpbuf, quant_limit);
+
+		// If attempting two planes is much worse than the best one plane result
+		// then further two plane searches are unlikely to help so move on ...
+		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
+		{
+			break;
+		}
+
+		if (errorval < error_threshold)
+		{
+			trace_add_data("exit", "quality hit");
+			goto END_OF_TESTS;
+		}
+	}
+
+	// Find best blocks for 2, 3 and 4 partitions
+	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
+	{
+		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
+
+		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
+		requested_trials = astc::min(requested_trials, requested_indices);
+
+		unsigned int actual_trials = find_best_partition_candidates(
+		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
+
+		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
+
+		for (unsigned int i = 0; i < actual_trials; i++)
+		{
+			TRACE_NODE(node1, "pass");
+			trace_add_data("partition_count", partition_count);
+			trace_add_data("partition_index", partition_indices[i]);
+			trace_add_data("plane_count", 1);
+			trace_add_data("search_mode", i);
+
+			float errorval = compress_symbolic_block_for_partition_1plane(
+			    ctx.config, bsd, blk, false,
+			    error_threshold * errorval_overshoot,
+			    partition_count, partition_indices[i],
+			    scb, tmpbuf, quant_limit);
+
+			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
+
+			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
+			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
+			// aligns with a partitioning that suits that encoding, so for this inner loop check add
+			// a large error scale because the "other" trial could be a lot better.
+			float best_error = best_errorvals_for_pcount[partition_count - 1];
+			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
+			if (best_error > (best_error_in_prev * best_error_scale))
+			{
+				trace_add_data("skip", "tune_partition_early_out_limit_factor");
+				goto END_OF_TESTS;
+			}
+
+			if (errorval < error_threshold)
+			{
+				trace_add_data("exit", "quality hit");
+				goto END_OF_TESTS;
+			}
+		}
+
+		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
+		float best_error = best_errorvals_for_pcount[partition_count - 1];
+		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
+		if (best_error > (best_error_in_prev * best_error_scale))
+		{
+			trace_add_data("skip", "tune_partition_early_out_limit_factor");
+			goto END_OF_TESTS;
+		}
+	}
+
+	trace_add_data("exit", "quality not hit");
+
+END_OF_TESTS:
+	// If we still have an error block then convert to something we can encode
+	// TODO: Do something more sensible here, such as average color block
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+#if defined(ASTCENC_DIAGNOSTICS)
+		static bool printed_once = false;
+		if (!printed_once)
+		{
+			printed_once = true;
+			printf("WARN: At least one block failed to find a valid encoding.\n"
+			       "      Try increasing compression quality settings.\n\n");
+		}
+#endif
+
+		scb.block_type = SYM_BTYPE_CONST_U16;
+		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
+		vint4 color_u16 = float_to_int_rtn(color_f32);
+		store(color_u16, scb.constant_color);
+	}
+
+	// Compress to a physical block
+	symbolic_to_physical(bsd, scb, pcb);
+}
+
+#endif

+ 472 - 0
thirdparty/astcenc/astcenc_compute_variance.cpp

@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions to calculate variance per component in a NxN footprint.
+ *
+ * We need N to be parametric, so the routine below uses summed area tables in order to execute in
+ * O(1) time independent of how big N is.
+ *
+ * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
+ * perform a binary reduction, and then distributes the results. This method means that there is no
+ * serial dependency between a given element and the next one, and also significantly improves
+ * numerical stability allowing us to use floats rather than doubles.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
+ *
+ * This will take an input array of the form:
+ *     v0, v1, v2, ...
+ * ... and modify in-place to turn it into a prefix-sum array of the form:
+ *     v0, v0+v1, v0+v1+v2, ...
+ *
+ * @param d      The array to prefix-sum.
+ * @param items  The number of items in the array.
+ * @param stride The item spacing in the array; i.e. dense arrays should use 1.
+ */
+static void brent_kung_prefix_sum(
+	vfloat4* d,
+	size_t items,
+	int stride
+) {
+	// Zero or one element is already its own prefix sum
+	if (items < 2)
+		return;
+
+	size_t lc_stride = 2;
+	size_t log2_stride = 1;
+
+	// The reduction-tree loop: each pass folds the element `step` positions
+	// behind into the current element, doubling the active stride each pass
+	do {
+		size_t step = lc_stride >> 1;
+		size_t start = lc_stride - 1;
+		size_t iters = items >> log2_stride;
+
+		// `ofs` is negative: it reaches back to the partial sum being merged in
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+
+		log2_stride += 1;
+		lc_stride <<= 1;
+	} while (lc_stride <= items);
+
+	// The expansion-tree loop: distribute the reduced partial sums back down
+	// to the intermediate elements, halving the stride each pass
+	do {
+		log2_stride -= 1;
+		lc_stride >>= 1;
+
+		size_t step = lc_stride >> 1;
+		size_t start = step + lc_stride - 1;
+		size_t iters = (items - step) >> log2_stride;
+
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+	} while (lc_stride > 2);
+}
+
+/* See header for documentation. */
+// Builds summed-area tables (SAT) over a padded copy of one image region and
+// uses them to fill ctx.input_alpha_averages with the per-texel mean alpha
+// over the configured kernel neighborhood.
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg
+) {
+	// Unpack the memory structure into local variables
+	const astcenc_image* img = arg.img;
+	astcenc_swizzle swz = arg.swz;
+	bool have_z = arg.have_z;
+
+	int size_x = arg.size_x;
+	int size_y = arg.size_y;
+	int size_z = arg.size_z;
+
+	int offset_x = arg.offset_x;
+	int offset_y = arg.offset_y;
+	int offset_z = arg.offset_z;
+
+	int alpha_kernel_radius = arg.alpha_kernel_radius;
+
+	float*   input_alpha_averages = ctx.input_alpha_averages;
+	vfloat4* work_memory = arg.work_memory;
+
+	// Compute memory sizes and dimensions that we need
+	int kernel_radius = alpha_kernel_radius;
+	int kerneldim = 2 * kernel_radius + 1;
+	int kernel_radius_xy = kernel_radius;
+	int kernel_radius_z = have_z ? kernel_radius : 0;
+
+	// Padded region sizes; the extra border holds the SAT zero edge and the
+	// kernel apron around the region
+	int padsize_x = size_x + kerneldim;
+	int padsize_y = size_y + kerneldim;
+	int padsize_z = size_z + (have_z ? kerneldim : 0);
+	int sizeprod = padsize_x * padsize_y * padsize_z;
+
+	int zd_start = have_z ? 1 : 0;
+
+	// varbuf1 accumulates values, varbuf2 accumulates squared values
+	vfloat4 *varbuf1 = work_memory;
+	vfloat4 *varbuf2 = work_memory + sizeprod;
+
+	// Scaling factors to apply to Y and Z for accesses into the work buffers
+	int yst = padsize_x;
+	int zst = padsize_x * padsize_y;
+
+	// Scaling factors to apply to Y and Z for accesses into result buffers
+	int ydt = img->dim_x;
+	int zdt = img->dim_x * img->dim_y;
+
+	// Macros to act as accessor functions for the work-memory
+	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
+	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
+
+	// Load N and N^2 values into the work buffers
+	// Note: image rows are assumed tightly packed with 4 channels per texel,
+	// as reflected by the 4 * dim_x * y + 4 * x indexing below
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE
+		uint8_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 255;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			// Clamp source coordinates so the apron replicates the image edge
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					uint8_t r = data[swz.r];
+					uint8_t g = data[swz.g];
+					uint8_t b = data[swz.b];
+					uint8_t a = data[swz.a];
+
+					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
+					                     g * (1.0f / 255.0f),
+					                     b * (1.0f / 255.0f),
+					                     a * (1.0f / 255.0f));
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
+		uint16_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 0x3C00;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					vfloat4 d = float16_to_float(di);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
+		float data[6];
+		data[ASTCENC_SWZ_0] = 0.0f;
+		data[ASTCENC_SWZ_1] = 1.0f;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			float* data32 = static_cast<float*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					float r = data[swz.r];
+					float g = data[swz.g];
+					float b = data[swz.b];
+					float a = data[swz.a];
+
+					vfloat4 d(r, g, b, a);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+
+	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
+	vfloat4 vbz = vfloat4::zero();
+	for (int z = 0; z < padsize_z; z++)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			VARBUF1(z, y, 0) = vbz;
+			VARBUF2(z, y, 0) = vbz;
+		}
+
+		for (int x = 0; x < padsize_x; x++)
+		{
+			VARBUF1(z, 0, x) = vbz;
+			VARBUF2(z, 0, x) = vbz;
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			for (int x = 0; x < padsize_x; x++)
+			{
+				VARBUF1(0, y, x) = vbz;
+				VARBUF2(0, y, x) = vbz;
+			}
+		}
+	}
+
+	// Generate summed-area tables for N and N^2; this is done in-place, using
+	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
+			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
+		}
+	}
+
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int x = 1; x < padsize_x; x++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
+			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			for (int x = 1; x < padsize_x; x++)
+			{
+				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
+				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
+			}
+		}
+	}
+
+	// Compute a few constants used in the variance-calculation.
+	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
+	float alpha_rsamples;
+
+	// Reciprocal of the kernel sample count (cubed for 3D, squared for 2D)
+	if (have_z)
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
+	}
+	else
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
+	}
+
+	// Use the summed-area tables to compute variance for each neighborhood
+	if (have_z)
+	{
+		for (int z = 0; z < size_z; z++)
+		{
+			int z_src = z + kernel_radius_z;
+			int z_dst = z + offset_z;
+			int z_low  = z_src - alpha_kernel_radius;
+			int z_high = z_src + alpha_kernel_radius + 1;
+
+			for (int y = 0; y < size_y; y++)
+			{
+				int y_src = y + kernel_radius_xy;
+				int y_dst = y + offset_y;
+				int y_low  = y_src - alpha_kernel_radius;
+				int y_high = y_src + alpha_kernel_radius + 1;
+
+				for (int x = 0; x < size_x; x++)
+				{
+					int x_src = x + kernel_radius_xy;
+					int x_dst = x + offset_x;
+					int x_low  = x_src - alpha_kernel_radius;
+					int x_high = x_src + alpha_kernel_radius + 1;
+
+					// Summed-area table lookups for alpha average
+					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
+					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
+					               - VARBUF1(z_high, y_high, x_low).lane<3>()
+					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
+					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
+					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
+					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
+					               + VARBUF1(z_low,  y_high, x_high).lane<3>());
+
+					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
+					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+				}
+			}
+		}
+	}
+	else
+	{
+		for (int y = 0; y < size_y; y++)
+		{
+			int y_src = y + kernel_radius_xy;
+			int y_dst = y + offset_y;
+			int y_low  = y_src - alpha_kernel_radius;
+			int y_high = y_src + alpha_kernel_radius + 1;
+
+			for (int x = 0; x < size_x; x++)
+			{
+				int x_src = x + kernel_radius_xy;
+				int x_dst = x + offset_x;
+				int x_low  = x_src - alpha_kernel_radius;
+				int x_high = x_src + alpha_kernel_radius + 1;
+
+				// Summed-area table lookups for alpha average
+				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
+				            - VARBUF1(0, y_low,  x_high).lane<3>()
+				            - VARBUF1(0, y_high, x_low).lane<3>()
+				            + VARBUF1(0, y_high, x_high).lane<3>();
+
+				int out_index = y_dst * ydt + x_dst;
+				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+			}
+		}
+	}
+}
+
+/**
+ * @brief Size the working state for the image averages pass.
+ *
+ * Fills @c ag with the image pointer, swizzle, kernel radius, processing
+ * block sizes, and the work memory element count; per-region fields are
+ * zeroed here and populated later. Returns the number of parallel tasks
+ * needed to cover the image.
+ */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag
+) {
+	unsigned int dim_x = img.dim_x;
+	unsigned int dim_y = img.dim_y;
+	unsigned int dim_z = img.dim_z;
+
+	bool is_3d = (dim_z > 1);
+
+	// Kernel footprint diameter for the averaging pass
+	unsigned int kernel_dim = 2 * alpha_kernel_radius + 1;
+
+	// Maximum processing block footprint; kept smaller for 3D images
+	unsigned int blk_xy = is_3d ? 16 : 32;
+	unsigned int blk_z = astc::min(dim_z, is_3d ? 16u : 1u);
+
+	// Padded block footprint including the kernel apron
+	unsigned int pad_xy = blk_xy + kernel_dim;
+	unsigned int pad_z = blk_z + (is_3d ? kernel_dim : 0);
+
+	// Zero the per-region fields which are populated later
+	ag.arg.size_x = 0;
+	ag.arg.size_y = 0;
+	ag.arg.size_z = 0;
+	ag.arg.offset_x = 0;
+	ag.arg.offset_y = 0;
+	ag.arg.offset_z = 0;
+	ag.arg.work_memory = nullptr;
+
+	ag.arg.img = &img;
+	ag.arg.swz = swz;
+	ag.arg.have_z = is_3d;
+	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
+
+	ag.img_size_x = dim_x;
+	ag.img_size_y = dim_y;
+	ag.img_size_z = dim_z;
+	ag.blk_size_xy = blk_xy;
+	ag.blk_size_z = blk_z;
+
+	// Two SAT buffers (values and squared values) per padded block
+	ag.work_memory_size = 2 * pad_xy * pad_xy * pad_z;
+
+	// One task per block-sized row band in each block-sized slice
+	unsigned int tasks_z = (dim_z + blk_z - 1) / blk_z;
+	unsigned int tasks_y = (dim_y + blk_xy - 1) / blk_xy;
+	return tasks_z * tasks_y;
+}
+
+#endif

+ 623 - 0
thirdparty/astcenc/astcenc_decompress_symbolic.cpp

@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions to decompress a symbolic block.
+ */
+
+#include "astcenc_internal.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+/**
+ * @brief Integer linear interpolation between two color endpoints.
+ *
+ * @param decode_mode   The ASTC profile (linear or sRGB).
+ * @param color0        The endpoint0 color.
+ * @param color1        The endpoint1 color.
+ * @param weights       The interpolation weights (between 0 and 64).
+ *
+ * @return The interpolated color.
+ */
+static vint4 lerp_color_int(
+	astcenc_profile decode_mode,
+	vint4 color0,
+	vint4 color1,
+	vint4 weights
+) {
+	bool is_srgb = decode_mode == ASTCENC_PRF_LDR_SRGB;
+
+	// For sRGB decode only the top 8 bits take part in the interpolation
+	if (is_srgb)
+	{
+		color0 = asr<8>(color0);
+		color1 = asr<8>(color1);
+	}
+
+	// Fixed-point blend with weights summing to 64, rounded to nearest
+	vint4 weight0 = vint4(64) - weights;
+	vint4 blended = (color0 * weight0) + (color1 * weights) + vint4(32);
+	blended = asr<6>(blended);
+
+	// Replicate the 8-bit sRGB result back up to the 16-bit range
+	if (is_srgb)
+	{
+		blended = blended * vint4(257);
+	}
+
+	return blended;
+}
+
+
+/**
+ * @brief Convert a post-interpolation integer color into a decoder float.
+ *
+ * @param data       The integer color value post-interpolation.
+ * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
+ *
+ * @return The float color value.
+ */
+static inline vfloat4 decode_texel(
+	vint4 data,
+	vmask4 lns_mask
+) {
+	vint4 hdr_f16 = vint4::zero();
+	vint4 ldr_f16 = vint4::zero();
+
+	// Only run each conversion path when at least one lane needs it
+	if (any(lns_mask))
+	{
+		hdr_f16 = lns_to_sf16(data);
+	}
+
+	if (!all(lns_mask))
+	{
+		ldr_f16 = unorm16_to_sf16(data);
+	}
+
+	// Select the right FP16 value per lane, then widen to FP32
+	vint4 merged = select(ldr_f16, hdr_f16, lns_mask);
+	return float16_to_float(merged);
+}
+
+/* See header for documentation. */
+// Unquantizes and undecimates the stored block weights, producing one
+// per-texel weight in [0, 64] per active plane via table lookups.
+void unpack_weights(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const decimation_info& di,
+	bool is_dual_plane,
+	int weights_plane1[BLOCK_MAX_TEXELS],
+	int weights_plane2[BLOCK_MAX_TEXELS]
+) {
+	// Safe to overshoot as all arrays are allocated to full size
+	if (!is_dual_plane)
+	{
+		// Build full 64-entry weight lookup table
+		vint4 tab0(reinterpret_cast<const int*>(scb.weights +  0));
+		vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
+		vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
+		vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
+
+		vint tab0p, tab1p, tab2p, tab3p;
+		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+
+		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Start at 8 so the final >> 4 rounds to nearest
+			vint summed_value(8);
+			vint weight_count(di.texel_weight_count + i);
+			int max_weight_count = hmax(weight_count).lane<0>();
+
+			promise(max_weight_count > 0);
+			for (int j = 0; j < max_weight_count; j++)
+			{
+				// Gather stored weights and accumulate their per-texel
+				// integer contributions
+				vint texel_weights(di.texel_weights_tr[j] + i);
+				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
+
+				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+			}
+
+			store(lsr<4>(summed_value), weights_plane1 + i);
+		}
+	}
+	else
+	{
+		// Build a 32-entry weight lookup table per plane
+		// Plane 1
+		vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights +  0));
+		vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
+		vint tab0_plane1p, tab1_plane1p;
+		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+
+		// Plane 2
+		vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
+		vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
+		vint tab0_plane2p, tab1_plane2p;
+		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+
+		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Start at 8 so the final >> 4 rounds to nearest
+			vint sum_plane1(8);
+			vint sum_plane2(8);
+
+			vint weight_count(di.texel_weight_count + i);
+			int max_weight_count = hmax(weight_count).lane<0>();
+
+			promise(max_weight_count > 0);
+			for (int j = 0; j < max_weight_count; j++)
+			{
+				vint texel_weights(di.texel_weights_tr[j] + i);
+				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
+
+				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
+				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+			}
+
+			store(lsr<4>(sum_plane1), weights_plane1 + i);
+			store(lsr<4>(sum_plane2), weights_plane2 + i);
+		}
+	}
+}
+
+/**
+ * @brief Build the FP32 NaN used to signal decode errors.
+ *
+ * The bit pattern is chosen so that narrowing to FP16 yields the all-ones
+ * NaN encoding 0xFFFF.
+ *
+ * @return The error color NaN as a float.
+ */
+static float error_color_nan()
+{
+	if32 bits;
+	bits.u = 0xFFFFE000U;
+	return bits.f;
+}
+
+/* See header for documentation. */
+// Decodes one symbolic compressed block into float texel colors, handling
+// error blocks, constant-color blocks, and the general weighted-endpoint
+// case with up to 4 partitions and an optional second plane.
+void decompress_symbolic_block(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	int xpos,
+	int ypos,
+	int zpos,
+	const symbolic_compressed_block& scb,
+	image_block& blk
+) {
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	// Statistics fields are not recomputed for decoded blocks
+	blk.data_min = vfloat4::zero();
+	blk.data_mean = vfloat4::zero();
+	blk.data_max = vfloat4::zero();
+	blk.grayscale = false;
+
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = error_color_nan();
+			blk.data_g[i] = error_color_nan();
+			blk.data_b[i] = error_color_nan();
+			blk.data_a[i] = error_color_nan();
+			blk.rgb_lns[i] = 0;
+			blk.alpha_lns[i] = 0;
+		}
+
+		return;
+	}
+
+	// Constant-color blocks fill every texel with the same value
+	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
+	    (scb.block_type == SYM_BTYPE_CONST_U16))
+	{
+		vfloat4 color;
+		uint8_t use_lns = 0;
+
+		// UNORM16 constant color block
+		if (scb.block_type == SYM_BTYPE_CONST_U16)
+		{
+			vint4 colori(scb.constant_color);
+
+			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
+			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
+			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+			{
+				colori = asr<8>(colori) * 257;
+			}
+
+			vint4 colorf16 = unorm16_to_sf16(colori);
+			color = float16_to_float(colorf16);
+		}
+		// FLOAT16 constant color block
+		else
+		{
+			switch (decode_mode)
+			{
+			case ASTCENC_PRF_LDR_SRGB:
+			case ASTCENC_PRF_LDR:
+				// HDR constant blocks are invalid in LDR profiles
+				color = vfloat4(error_color_nan());
+				break;
+			case ASTCENC_PRF_HDR_RGB_LDR_A:
+			case ASTCENC_PRF_HDR:
+				// Constant-color block; unpack from FP16 to FP32.
+				color = float16_to_float(vint4(scb.constant_color));
+				use_lns = 1;
+				break;
+			}
+		}
+
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = color.lane<0>();
+			blk.data_g[i] = color.lane<1>();
+			blk.data_b[i] = color.lane<2>();
+			blk.data_a[i] = color.lane<3>();
+			blk.rgb_lns[i] = use_lns;
+			blk.alpha_lns[i] = use_lns;
+		}
+
+		return;
+	}
+
+	// Get the appropriate partition-table entry
+	int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptors
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	int plane2_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
+
+	// Now that we have endpoint colors and weights, we can unpack texel colors
+	// plane2_mask selects which single component uses the second weight plane
+	int plane2_component = scb.plane2_component;
+	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+
+		// Interpolate and decode every texel owned by this partition
+		int texel_count = pi.partition_texel_count[i];
+		for (int j = 0; j < texel_count; j++)
+		{
+			int tix = pi.texels_of_partition[i][j];
+			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
+			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
+			vfloat4 colorf = decode_texel(color, lns_mask);
+
+			blk.data_r[tix] = colorf.lane<0>();
+			blk.data_g[tix] = colorf.lane<1>();
+			blk.data_b[tix] = colorf.lane<2>();
+			blk.data_a[tix] = colorf.lane<3>();
+		}
+	}
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/* See header for documentation. */
+// Returns the channel-weighted sum of squared texel errors between the
+// original image block and the decode of this dual-plane, single-partition
+// candidate encoding. Returns ERROR_CALC_DEFAULT for error blocks, and the
+// negative sentinel -ERROR_CALC_DEFAULT to reject RGBM encodings that decode
+// to a zero M (multiplier) channel.
+float compute_symbolic_block_difference_2plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+	assert(scb.partition_count == 1);
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	int plane2_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
+
+	// Mask selecting which color component is driven by the second weight plane
+	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
+
+	vfloat4 summa = vfloat4::zero();
+
+	// Decode the color endpoints for this partition
+	vint4 ep0;
+	vint4 ep1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       ep0, ep1);
+
+	// Unpack and compute error for each texel in the partition
+	unsigned int texel_count = bsd.texel_count;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
+		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
+
+		vfloat4 color = int_to_float(colori);
+		vfloat4 oldColor = blk.texel(i);
+
+		// Compare error using a perceptual decode metric for RGBM textures
+		if (config.flags & ASTCENC_FLG_MAP_RGBM)
+		{
+			// Fail encodings that result in zero weight M pixels. Note that this can cause
+			// "interesting" artifacts if we reject all useful encodings - we typically get max
+			// brightness encodings instead which look just as bad. We recommend users apply a
+			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
+			// getting small M values post-quantization, but we can't prove it would never
+			// happen, especially at low bit rates ...
+			if (color.lane<3>() == 0.0f)
+			{
+				return -ERROR_CALC_DEFAULT;
+			}
+
+			// Compute error based on decoded RGBM color
+			color = vfloat4(
+				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
+				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
+				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
+				1.0f
+			);
+
+			oldColor = vfloat4(
+				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
+				1.0f
+			);
+		}
+
+		vfloat4 error = oldColor - color;
+		error = min(abs(error), 1e15f);
+		error = error * error;
+
+		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+	}
+
+	return summa.lane<0>();
+}
+
+/* See header for documentation. */
+// Returns the channel-weighted sum of squared texel errors between the
+// original image block and the decode of this single-plane (possibly
+// multi-partition) candidate encoding. Returns ERROR_CALC_DEFAULT for error
+// blocks, and -ERROR_CALC_DEFAULT to reject RGBM encodings with a zero M.
+float compute_symbolic_block_difference_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
+
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+
+	// Get the appropriate partition-table entry
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
+
+	vfloat4 summa = vfloat4::zero();
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(config.profile,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		// Unpack and compute error for each texel in the partition
+		unsigned int texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
+			                              vint4(plane1_weights[tix]));
+
+			vfloat4 color = int_to_float(colori);
+			vfloat4 oldColor = blk.texel(tix);
+
+			// Compare error using a perceptual decode metric for RGBM textures
+			if (config.flags & ASTCENC_FLG_MAP_RGBM)
+			{
+				// Fail encodings that result in zero weight M pixels. Note that this can cause
+				// "interesting" artifacts if we reject all useful encodings - we typically get max
+				// brightness encodings instead which look just as bad. We recommend users apply a
+				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
+				// getting small M values post-quantization, but we can't prove it would never
+				// happen, especially at low bit rates ...
+				if (color.lane<3>() == 0.0f)
+				{
+					return -ERROR_CALC_DEFAULT;
+				}
+
+				// Compute error based on decoded RGBM color
+				color = vfloat4(
+					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+
+				oldColor = vfloat4(
+					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+			}
+
+			vfloat4 error = oldColor - color;
+			error = min(abs(error), 1e15f);
+			error = error * error;
+
+			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+		}
+	}
+
+	return summa.lane<0>();
+}
+
+/* See header for documentation. */
+// Fast path for the common 1 plane / 1 partition case: computes the same
+// channel-weighted squared-error metric as the general variants above, but
+// vectorized ASTCENC_SIMD_WIDTH texels at a time over the whole block.
+float compute_symbolic_block_difference_1plane_1partition(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	assert(scb.block_mode >= 0);
+	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights
+	// (aligned so the SIMD loop below can use aligned loads)
+	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
+
+	// Decode the color endpoints for this partition
+	vint4 ep0;
+	vint4 ep1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       ep0, ep1);
+
+
+	// Pre-shift sRGB so things round correctly
+	if (config.profile == ASTCENC_PRF_LDR_SRGB)
+	{
+		ep0 = asr<8>(ep0);
+		ep1 = asr<8>(ep1);
+	}
+
+	// Unpack and compute error for each texel in the partition
+	vfloatacc summav = vfloatacc::zero();
+
+	vint lane_id = vint::lane_id();
+	// NOTE(review): 257 re-expands the 8-bit sRGB value back to 16 bits
+	// (x * 257 == (x << 8) | x), matching the >>8 pre-shift above — confirm.
+	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
+
+	unsigned int texel_count = bsd.texel_count;
+	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Compute EP1 contribution
+		vint weight1 = vint::loada(plane1_weights + i);
+		vint ep1_r = vint(ep1.lane<0>()) * weight1;
+		vint ep1_g = vint(ep1.lane<1>()) * weight1;
+		vint ep1_b = vint(ep1.lane<2>()) * weight1;
+		vint ep1_a = vint(ep1.lane<3>()) * weight1;
+
+		// Compute EP0 contribution
+		vint weight0 = vint(64) - weight1;
+		vint ep0_r = vint(ep0.lane<0>()) * weight0;
+		vint ep0_g = vint(ep0.lane<1>()) * weight0;
+		vint ep0_b = vint(ep0.lane<2>()) * weight0;
+		vint ep0_a = vint(ep0.lane<3>()) * weight0;
+
+		// Shift so things round correctly
+		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
+		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
+		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
+		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
+
+		// Compute color diff
+		vfloat color_r = int_to_float(colori_r);
+		vfloat color_g = int_to_float(colori_g);
+		vfloat color_b = int_to_float(colori_b);
+		vfloat color_a = int_to_float(colori_a);
+
+		vfloat color_orig_r = loada(blk.data_r + i);
+		vfloat color_orig_g = loada(blk.data_g + i);
+		vfloat color_orig_b = loada(blk.data_b + i);
+		vfloat color_orig_a = loada(blk.data_a + i);
+
+		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
+		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
+		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
+		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
+
+		// Compute squared error metric
+		color_error_r = color_error_r * color_error_r;
+		color_error_g = color_error_g * color_error_g;
+		color_error_b = color_error_b * color_error_b;
+		color_error_a = color_error_a * color_error_a;
+
+		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+		              + color_error_g * blk.channel_weight.lane<1>()
+		              + color_error_b * blk.channel_weight.lane<2>()
+		              + color_error_a * blk.channel_weight.lane<3>();
+
+		// Mask off bad lanes
+		vmask mask = lane_id < vint(texel_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(summav, metric, mask);
+	}
+
+	return hadd_s(summav);
+}
+
+#endif

+ 230 - 0
thirdparty/astcenc/astcenc_diagnostic_trace.cpp

@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the diagnostic trace utilities.
+ */
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdio>
+#include <string>
+
+#include "astcenc_diagnostic_trace.h"
+
+/** @brief The global trace logger. */
+static TraceLog* g_TraceLog = nullptr;
+
+/** @brief The JSON indentation level. */
+static const size_t g_trace_indent = 2;
+
+// Open the output file and register this log as the process-wide singleton.
+TraceLog::TraceLog(
+	const char* file_name):
+	m_file(file_name, std::ofstream::out | std::ofstream::binary)
+{
+	// Only one TraceLog may exist at a time. Creating the root node also
+	// pushes it onto the node stack and writes the JSON document prologue.
+	assert(!g_TraceLog);
+	g_TraceLog = this;
+	m_root = new TraceNode("root");
+}
+
+/* See header for documentation. */
+// The most recently opened (not yet destroyed) node sits at the back of the
+// stack; returns nullptr if no node is currently open.
+TraceNode* TraceLog::get_current_leaf()
+{
+	if (m_stack.size())
+	{
+		return m_stack.back();
+	}
+
+	return nullptr;
+}
+
+/* See header for documentation. */
+// Number of currently open nodes, including the root node.
+size_t TraceLog::get_depth()
+{
+	return m_stack.size();
+}
+
+/* See header for documentation. */
+// Deleting the root node runs ~TraceNode, which writes the closing brackets
+// that complete the JSON document before m_file is destroyed and closed.
+TraceLog::~TraceLog()
+{
+	assert(g_TraceLog == this);
+	delete m_root;
+	g_TraceLog = nullptr;
+}
+
+/* See header for documentation. */
+// Writes the JSON prologue for this node ('[ "node", "<name>", [') and pushes
+// it onto the global node stack; the matching epilogue is emitted by the
+// destructor.
+TraceNode::TraceNode(
+	const char* format,
+	...
+) {
+	// Format the name string
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	// Generate the node
+	TraceNode* parent = g_TraceLog->get_current_leaf();
+	size_t depth = g_TraceLog->get_depth();
+	g_TraceLog->m_stack.push_back(this);
+
+	bool comma = parent && parent->m_attrib_count;
+	auto& out = g_TraceLog->m_file;
+
+	if (parent)
+	{
+		parent->m_attrib_count++;
+	}
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	if (depth)
+	{
+		out << '\n';
+	}
+
+	// Each open node contributes two JSON nesting levels: the node array
+	// itself and its child/attribute array.
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	out << out_indents << "[ \"node\", \"" << buffer << "\",\n";
+	out << in_indents << "[";
+}
+
+/* See header for documentation. */
+// Appends one '[ "key", value ]' attribute entry to this node. The value is
+// written verbatim, so string values must arrive pre-quoted (see the header
+// docs); the type parameter is currently unused.
+void TraceNode::add_attrib(
+	std::string type,
+	std::string key,
+	std::string value
+) {
+	(void)type;
+
+	size_t depth = g_TraceLog->get_depth();
+	size_t indent = (depth * 2) * g_trace_indent;
+	auto& out = g_TraceLog->m_file;
+	bool comma = m_attrib_count;
+	m_attrib_count++;
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	out << '\n';
+	out << std::string(indent, ' ') << "[ "
+	                                << "\"" << key << "\", "
+	                                << value << " ]";
+}
+
+/* See header for documentation. */
+// Pops this node from the stack and writes the closing brackets for both its
+// attribute array and the node array opened by the constructor.
+TraceNode::~TraceNode()
+{
+	g_TraceLog->m_stack.pop_back();
+
+	auto& out = g_TraceLog->m_file;
+	size_t depth = g_TraceLog->get_depth();
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	if (m_attrib_count)
+	{
+		out << "\n" << in_indents;
+	}
+	out << "]\n";
+
+	out << out_indents << "]";
+}
+
+/* See header for documentation. */
+// Format and attach a quoted string attribute to the current leaf node.
+void trace_add_data(
+	const char* key,
+	const char* format,
+	...
+) {
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	// NOTE(review): embedded '"' or '\' in the formatted value are not
+	// JSON-escaped and would corrupt the output — assumes values are plain
+	// identifiers/numbers; confirm against callers.
+	std::string value = "\"" + std::string(buffer) + "\"";
+
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("str", key, value);
+}
+
+/* See header for documentation. */
+// Attach a float attribute to the current leaf node, serialized with enough
+// digits (%.20g) to round-trip the value exactly.
+void trace_add_data(
+	const char* key,
+	float value
+) {
+	// Use a bounded write: sprintf is unbounded and the original line also
+	// carried stray mixed space/tab indentation. %.20g of a double always
+	// fits in 256 bytes, but snprintf guards against future format changes.
+	char buffer[256];
+	snprintf(buffer, sizeof(buffer), "%.20g", static_cast<double>(value));
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("float", key, buffer);
+}
+
+/* See header for documentation. */
+// Attach a signed integer attribute (unquoted) to the current leaf node.
+void trace_add_data(
+	const char* key,
+	int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+/* See header for documentation. */
+// Attach an unsigned integer attribute (unquoted) to the current leaf node.
+void trace_add_data(
+	const char* key,
+	unsigned int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+#endif

+ 219 - 0
thirdparty/astcenc/astcenc_diagnostic_trace.h

@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief This module provides a set of diagnostic tracing utilities.
+ *
+ * Overview
+ * ========
+ *
+ * The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
+ * hierarchy contains three levels:
+ *
+ *    - block
+ *        - pass
+ *           - candidate
+ *
+ * One block node exists for each compressed block in the image. One pass node exists for each major
+ * pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
+ * encoding candidate trialed for a pass.
+ *
+ * Each node contains both the hierarchy but also a number of attributes which explain the behavior.
+ * For example, the block node contains the block coordinates in the image, the pass explains the
+ * pass configuration, and the candidate will explain the candidate encoding such as weight
+ * decimation, refinement error, etc.
+ *
+ * Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
+ * Constructing a trace node on the stack will automatically add it to the current node as a child,
+ * and then make it the current node. Destroying the current node will pop the stack and set the
+ * parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
+ * tree structure.
+ *
+ * A set of utility macros are provided to add attribute annotations to the current trace node.
+ *
+ * Usage
+ * =====
+ *
+ * Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
+ * in builds with diagnostics disabled.
+ *
+ * Add annotations to the current trace node using the @c trace_add_data() macro. This will
+ * similarly compile out completely in builds with diagnostics disabled.
+ *
+ * If you need to add additional code to support diagnostics-only behavior wrap
+ * it in preprocessor guards:
+ *
+ *     #if defined(ASTCENC_DIAGNOSTICS)
+ *     #endif
+ */
+
+#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+/**
+ * @brief Class representing a single node in the trace hierarchy.
+ */
+class TraceNode
+{
+public:
+	/**
+	 * @brief Construct a new node.
+	 *
+	 * Constructing a node will push to the top of the stack, automatically making it a child of
+	 * the current node, and then setting it to become the current node.
+	 *
+	 * @param format   The format template for the node name.
+	 * @param ...      The format parameters.
+	 */
+	TraceNode(const char* format, ...);
+
+	/**
+	 * @brief Add an attribute to this node.
+	 *
+	 * Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
+	 * the caller.
+	 *
+	 * @param type    The type of the attribute.
+	 * @param key     The key of the attribute.
+	 * @param value   The value of the attribute.
+	 */
+	void add_attrib(std::string type, std::string key, std::string value);
+
+	/**
+	 * @brief Destroy this node.
+	 *
+	 * Destroying a node will pop it from the top of the stack, making its parent the current node.
+	 * It is invalid behavior to destroy a node that is not the current node; usage must conform to
+	 * stack push-pop semantics.
+	 */
+	~TraceNode();
+
+	/**
+	 * @brief The number of attributes and child nodes in this node.
+	 *
+	 * Used by the serializer to decide whether a separating comma is needed.
+	 */
+	unsigned int m_attrib_count { 0 };
+};
+
+/**
+ * @brief Class representing the trace log file being written.
+ */
+class TraceLog
+{
+public:
+	/**
+	 * @brief Create a new trace log.
+	 *
+	 * The trace log is global; there can be only one at a time.
+	 *
+	 * @param file_name   The name of the file to write.
+	 */
+	TraceLog(const char* file_name);
+
+	/**
+	 * @brief Destroy the trace log.
+	 *
+	 * Trace logs MUST be cleanly destroyed to ensure the file gets written.
+	 */
+	~TraceLog();
+
+	/**
+	 * @brief Get the current child node.
+	 *
+	 * @return The current leaf node.
+	 */
+	TraceNode* get_current_leaf();
+
+	/**
+	 * @brief Get the stack depth of the current child node.
+	 *
+	 * @return The current leaf node stack depth.
+	 */
+	size_t get_depth();
+
+	/**
+	 * @brief The file stream to write to.
+	 *
+	 * Public because TraceNode serializes directly into it.
+	 */
+	std::ofstream m_file;
+
+	/**
+	 * @brief The stack of nodes (newest at the back).
+	 */
+	std::vector<TraceNode*> m_stack;
+
+private:
+	/**
+	 * @brief The root node in the JSON file.
+	 */
+	TraceNode* m_root;
+};
+
+/**
+ * @brief Utility macro to create a trace node on the stack.
+ *
+ * @param name     The variable name to use.
+ * @param ...      The name template and format parameters.
+ */
+#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
+
+/**
+ * @brief Add a string annotation to the current node.
+ *
+ * @param key      The name of the attribute.
+ * @param format   The format template for the attribute value.
+ * @param ...      The format parameters.
+ */
+void trace_add_data(const char* key, const char* format, ...);
+
+/**
+ * @brief Add a float annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, float value);
+
+/**
+ * @brief Add an integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, int value);
+
+/**
+ * @brief Add an unsigned integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, unsigned int value);
+
+#else
+
+#define TRACE_NODE(name, ...)
+
+#define trace_add_data(...)
+
+#endif
+
+#endif

+ 1427 - 0
thirdparty/astcenc/astcenc_entry.cpp

@@ -0,0 +1,1427 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the library entrypoint.
+ */
+
+#include <array>
+#include <cstring>
+#include <new>
+
+#include "astcenc.h"
+#include "astcenc_internal_entry.h"
+#include "astcenc_diagnostic_trace.h"
+
+/**
+ * @brief Record of the quality tuning parameter values.
+ *
+ * See the @c astcenc_config structure for detailed parameter documentation.
+ *
+ * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
+ * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
+ * for the more thorough search presets because the underlying db_limit is so much higher.
+ */
+// NOTE: member order is load-bearing — the preset_configs_* tables below use
+// positional aggregate initialization matching this declaration order.
+struct astcenc_preset_config
+{
+	float quality;
+	unsigned int tune_partition_count_limit;
+	unsigned int tune_2partition_index_limit;
+	unsigned int tune_3partition_index_limit;
+	unsigned int tune_4partition_index_limit;
+	unsigned int tune_block_mode_limit;
+	unsigned int tune_refinement_limit;
+	unsigned int tune_candidate_limit;
+	unsigned int tune_2partitioning_candidate_limit;
+	unsigned int tune_3partitioning_candidate_limit;
+	unsigned int tune_4partitioning_candidate_limit;
+	float tune_db_limit_a_base;
+	float tune_db_limit_b_base;
+	float tune_mse_overshoot;
+	float tune_2_partition_early_out_limit_factor;
+	float tune_3_partition_early_out_limit_factor;
+	float tune_2_plane_early_out_limit_correlation;
+};
+
+/**
+ * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
+ */
+// Rows are ordered fastest..exhaustive; fields are positional and follow
+// astcenc_preset_config's member order.
+static const std::array<astcenc_preset_config, 6> preset_configs_high {{
+	{
+		ASTCENC_PRE_FASTEST,
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
+	}, {
+		ASTCENC_PRE_FAST,
+		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
+	}, {
+		ASTCENC_PRE_MEDIUM,
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
+	}, {
+		ASTCENC_PRE_THOROUGH,
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
+	}, {
+		ASTCENC_PRE_EXHAUSTIVE,
+		4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
+	}
+}};
+
+/**
+ * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
+ */
+// Rows are ordered fastest..exhaustive; fields are positional and follow
+// astcenc_preset_config's member order.
+static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
+	{
+		ASTCENC_PRE_FASTEST,
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
+	}, {
+		ASTCENC_PRE_FAST,
+		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
+	}, {
+		ASTCENC_PRE_MEDIUM,
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
+	}, {
+		ASTCENC_PRE_THOROUGH,
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
+	}, {
+		ASTCENC_PRE_EXHAUSTIVE,
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
+	}
+}};
+
+/**
+ * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
+ */
+// Rows are ordered fastest..exhaustive; fields are positional and follow
+// astcenc_preset_config's member order.
+static const std::array<astcenc_preset_config, 6> preset_configs_low {{
+	{
+		ASTCENC_PRE_FASTEST,
+		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
+	}, {
+		ASTCENC_PRE_FAST,
+		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
+	}, {
+		ASTCENC_PRE_MEDIUM,
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
+	}, {
+		ASTCENC_PRE_THOROUGH,
+		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
+	}, {
+		ASTCENC_PRE_EXHAUSTIVE,
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
+	}
+}};
+
+/**
+ * @brief Validate CPU floating point meets assumptions made in the codec.
+ *
+ * The codec is written with the assumption that a float threaded through the @c if32 union will be
+ * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
+ * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
+ * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_cpu_float()
+{
+	if32 p;
+	// volatile prevents the compiler from constant-folding the probe
+	volatile float xprec_testval = 2.51f;
+	// 12582912 = 1.5 * 2^23: at this magnitude a float ulp is 1.0, so under
+	// IEEE-754 round-to-nearest the 2.51 fraction must round the stored sum
+	// such that the difference comes back as exactly 3.0.
+	p.f = xprec_testval + 12582912.0f;
+	float q = p.f - 12582912.0f;
+
+	if (q != 3.0f)
+	{
+		return ASTCENC_ERR_BAD_CPU_FLOAT;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/**
+ * @brief Validate CPU ISA support meets the requirements of this build of the library.
+ *
+ * Each library build is statically compiled for a particular set of CPU ISA features, such as the
+ * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
+ * actually supports everything this build needs.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_cpu_isa()
+{
+	// Each check below is compiled in only when this build of the library
+	// statically targets that ISA feature.
+	#if ASTCENC_SSE >= 41
+		if (!cpu_supports_sse41())
+		{
+			return ASTCENC_ERR_BAD_CPU_ISA;
+		}
+	#endif
+
+	#if ASTCENC_POPCNT >= 1
+		if (!cpu_supports_popcnt())
+		{
+			return ASTCENC_ERR_BAD_CPU_ISA;
+		}
+	#endif
+
+	#if ASTCENC_F16C >= 1
+		if (!cpu_supports_f16c())
+		{
+			return ASTCENC_ERR_BAD_CPU_ISA;
+		}
+	#endif
+
+	#if ASTCENC_AVX >= 2
+		if (!cpu_supports_avx2())
+		{
+			return ASTCENC_ERR_BAD_CPU_ISA;
+		}
+	#endif
+
+	return ASTCENC_SUCCESS;
+}
+
+/**
+ * @brief Validate config profile.
+ *
+ * @param profile   The profile to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_profile(
+	astcenc_profile profile
+) {
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values
+	// (the int cast lets unknown values fall through to the error default)
+	switch (static_cast<int>(profile))
+	{
+	case ASTCENC_PRF_LDR_SRGB:
+	case ASTCENC_PRF_LDR:
+	case ASTCENC_PRF_HDR_RGB_LDR_A:
+	case ASTCENC_PRF_HDR:
+		return ASTCENC_SUCCESS;
+	default:
+		return ASTCENC_ERR_BAD_PROFILE;
+	}
+}
+
+/**
+ * @brief Validate block size.
+ *
+ * @param block_x   The block x dimensions.
+ * @param block_y   The block y dimensions.
+ * @param block_z   The block z dimensions.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_block_size(
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z
+) {
+	// Test if this is a legal block size at all
+	// (block_z <= 1 selects the 2D rule set, block_z >= 2 the 3D rule set)
+	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
+	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
+	if (!is_legal)
+	{
+		return ASTCENC_ERR_BAD_BLOCK_SIZE;
+	}
+
+	// Test if this build has sufficient capacity for this block size
+	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
+	if (!have_capacity)
+	{
+		return ASTCENC_ERR_NOT_IMPLEMENTED;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/**
+ * @brief Validate flags.
+ *
+ * @param flags   The flags to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_flags(
+	unsigned int flags
+) {
+	// Flags field must not contain any unknown flag bits
+	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
+	if (popcount(flags & exMask) != 0)
+	{
+		return ASTCENC_ERR_BAD_FLAGS;
+	}
+
+	// Flags field must only contain at most a single map type
+	// (popcount over the masked bits counts how many map-type flags are set)
+	exMask = ASTCENC_FLG_MAP_NORMAL
+	       | ASTCENC_FLG_MAP_RGBM;
+	if (popcount(flags & exMask) > 1)
+	{
+		return ASTCENC_ERR_BAD_FLAGS;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Validate single channel compression swizzle.
+ *
+ * @param swizzle   The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_compression_swz(
+	astcenc_swz swizzle
+) {
+	// Not all enum values are handled; SWZ_Z is invalid for compression
+	// (Z reconstruction is a decompression-only feature)
+	switch (static_cast<int>(swizzle))
+	{
+	case ASTCENC_SWZ_R:
+	case ASTCENC_SWZ_G:
+	case ASTCENC_SWZ_B:
+	case ASTCENC_SWZ_A:
+	case ASTCENC_SWZ_0:
+	case ASTCENC_SWZ_1:
+		return ASTCENC_SUCCESS;
+	default:
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+}
+
+/**
+ * @brief Validate overall compression swizzle.
+ *
+ * @param swizzle   The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_compression_swizzle(
+	const astcenc_swizzle& swizzle
+) {
+	// NOTE(review): relies on ASTCENC_SUCCESS being 0 so any error code is
+	// truthy in the || chain — confirm against astcenc.h.
+	if (validate_compression_swz(swizzle.r) ||
+	    validate_compression_swz(swizzle.g) ||
+	    validate_compression_swz(swizzle.b) ||
+	    validate_compression_swz(swizzle.a))
+	{
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+#endif
+
+/**
+ * @brief Validate single channel decompression swizzle.
+ *
+ * @param swizzle   The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_decompression_swz(
+	astcenc_swz swizzle
+) {
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values. Unlike compression, SWZ_Z is also legal here.
+	switch (static_cast<int>(swizzle))
+	{
+	case ASTCENC_SWZ_R:
+	case ASTCENC_SWZ_G:
+	case ASTCENC_SWZ_B:
+	case ASTCENC_SWZ_A:
+	case ASTCENC_SWZ_0:
+	case ASTCENC_SWZ_1:
+	case ASTCENC_SWZ_Z:
+		return ASTCENC_SUCCESS;
+	default:
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+}
+
+/**
+ * @brief Validate overall decompression swizzle.
+ *
+ * @param swizzle   The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_decompression_swizzle(
+	const astcenc_swizzle& swizzle
+) {
+	// Validate each of the four channel selectors independently; any single
+	// failure rejects the whole swizzle
+	if (validate_decompression_swz(swizzle.r) ||
+	    validate_decompression_swz(swizzle.g) ||
+	    validate_decompression_swz(swizzle.b) ||
+	    validate_decompression_swz(swizzle.a))
+	{
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/**
+ * Validate that an incoming configuration is in-spec.
+ *
+ * This function can respond in two ways:
+ *
+ *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
+ *     for out-of-range inputs in this case.
+ *   * Numerical and logic inputs that are logically invalid and which make no sense
+ *     algorithmically will return an error.
+ *
+ * @param[in,out] config   The input compressor configuration.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_config(
+	astcenc_config &config
+) {
+	astcenc_error status;
+
+	status = validate_profile(config.profile);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_flags(config.flags);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_block_size(config.block_x, config.block_y, config.block_z);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	// Decompress-only builds only support decompress-only contexts
+	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+#endif
+
+	// Clamp all tuning parameters into their supported ranges; out-of-range
+	// values are silently corrected rather than treated as errors
+	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
+
+	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
+	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
+	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
+	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
+	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
+	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
+	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
+	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
+	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
+	config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
+	config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
+	config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
+
+	// Specifying a zero weight color component is not allowed; force to small value
+	// (1/1000th of the largest weight) so every channel has some error contribution
+	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
+	                             astc::max(config.cw_b_weight, config.cw_a_weight));
+	if (max_weight > 0.0f)
+	{
+		max_weight /= 1000.0f;
+		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
+		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
+		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
+		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
+	}
+	// If all color components error weights are zero then return an error
+	else
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_config_init(
+	astcenc_profile profile,
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z,
+	float quality,
+	unsigned int flags,
+	astcenc_config* configp
+) {
+	astcenc_error status;
+
+	// Check basic library compatibility options here so they are checked early. Note, these checks
+	// are repeated in context_alloc for cases where callers use a manually defined config struct
+	status = validate_cpu_isa();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_cpu_float();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	// Zero init all config fields; although most of it will be overwritten below
+	astcenc_config& config = *configp;
+	std::memset(&config, 0, sizeof(config));
+
+	// Process the block size
+	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
+	status = validate_block_size(block_x, block_y, block_z);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	config.block_x = block_x;
+	config.block_y = block_y;
+	config.block_z = block_z;
+
+	// log10 of the texel count, used later to scale the preset dB limits
+	float texels = static_cast<float>(block_x * block_y * block_z);
+	float ltexels = logf(texels) / logf(10.0f);
+
+	// Process the performance quality level or preset; note that this must be done before we
+	// process any additional settings, such as color profile and flags, which may replace some of
+	// these settings with more use case tuned values
+	if (quality < ASTCENC_PRE_FASTEST ||
+	    quality > ASTCENC_PRE_EXHAUSTIVE)
+	{
+		return ASTCENC_ERR_BAD_QUALITY;
+	}
+
+	// Preset tables are tuned per block-size bucket: high effort for small
+	// blocks (< 25 texels), medium (< 64 texels), and low for larger blocks
+	static const std::array<astcenc_preset_config, 6>* preset_configs;
+	int texels_int = block_x * block_y * block_z;
+	if (texels_int < 25)
+	{
+		preset_configs = &preset_configs_high;
+	}
+	else if (texels_int < 64)
+	{
+		preset_configs = &preset_configs_mid;
+	}
+	else
+	{
+		preset_configs = &preset_configs_low;
+	}
+
+	// Determine which preset to use, or which pair to interpolate
+	size_t start;
+	size_t end;
+	for (end = 0; end < preset_configs->size(); end++)
+	{
+		if ((*preset_configs)[end].quality >= quality)
+		{
+			break;
+		}
+	}
+
+	start = end == 0 ? 0 : end - 1;
+
+	// Start and end node are the same - so just transfer the values.
+	if (start == end)
+	{
+		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
+		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
+		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
+		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
+		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
+		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
+		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
+		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
+
+		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
+
+		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
+		config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
+		config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
+	}
+	// Start and end node are not the same - so interpolate between them
+	else
+	{
+		auto& node_a = (*preset_configs)[start];
+		auto& node_b = (*preset_configs)[end];
+
+		float wt_range = node_b.quality - node_a.quality;
+		assert(wt_range > 0);
+
+		// Compute interpolation factors
+		float wt_node_a = (node_b.quality - quality) / wt_range;
+		float wt_node_b = (quality - node_a.quality) / wt_range;
+
+		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
+		#define LERPI(param) astc::flt2int_rtn(\
+		                         (static_cast<float>(node_a.param) * wt_node_a) + \
+		                         (static_cast<float>(node_b.param) * wt_node_b))
+		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
+
+		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
+		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
+		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
+		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
+		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
+		config.tune_refinement_limit = LERPI(tune_refinement_limit);
+		config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
+		                                        TUNE_MAX_TRIAL_CANDIDATES);
+		// NOTE(review): these three clamps bound against BLOCK_MAX_PARTITIONINGS,
+		// while the non-interpolated branch above bounds against
+		// TUNE_MAX_PARTITIONING_CANDIDATES; confirm the asymmetry is intentional
+		config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
+		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
+
+		config.tune_mse_overshoot = LERP(tune_mse_overshoot);
+
+		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
+		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
+		config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
+		#undef LERP
+		#undef LERPI
+		#undef LERPUI
+	}
+
+	// Set heuristics to the defaults for each color profile
+	config.cw_r_weight = 1.0f;
+	config.cw_g_weight = 1.0f;
+	config.cw_b_weight = 1.0f;
+	config.cw_a_weight = 1.0f;
+
+	config.a_scale_radius = 0;
+
+	config.rgbm_m_scale = 0.0f;
+
+	config.profile = profile;
+
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values
+	switch (static_cast<int>(profile))
+	{
+	case ASTCENC_PRF_LDR:
+	case ASTCENC_PRF_LDR_SRGB:
+		break;
+	case ASTCENC_PRF_HDR_RGB_LDR_A:
+	case ASTCENC_PRF_HDR:
+		// HDR error is not meaningfully expressed in dB, so disable the limit
+		config.tune_db_limit = 999.0f;
+		break;
+	default:
+		return ASTCENC_ERR_BAD_PROFILE;
+	}
+
+	// Flags field must not contain any unknown flag bits
+	status = validate_flags(flags);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	if (flags & ASTCENC_FLG_MAP_NORMAL)
+	{
+		// Normal map encoding uses L+A blocks, so allow one more partitioning
+		// than normal. We need fewer bits for endpoints, so more likely
+		// to be able to use more partitions than an RGB/RGBA block
+		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
+
+		config.cw_g_weight = 0.0f;
+		config.cw_b_weight = 0.0f;
+		config.tune_2_partition_early_out_limit_factor *= 1.5f;
+		config.tune_3_partition_early_out_limit_factor *= 1.5f;
+		config.tune_2_plane_early_out_limit_correlation = 0.99f;
+
+		// Normals are prone to blocking artifacts on smooth curves
+		// so force compressor to try harder here ...
+		config.tune_db_limit *= 1.03f;
+	}
+	else if (flags & ASTCENC_FLG_MAP_RGBM)
+	{
+		config.rgbm_m_scale = 5.0f;
+		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
+	}
+	else // (This is color data)
+	{
+		// This is a very basic perceptual metric for RGB color data, which weights error
+		// significance by the perceptual luminance contribution of each color channel. For
+		// luminance the usual weights to compute luminance from a linear RGB value are as
+		// follows:
+		//
+		//     l = r * 0.3 + g * 0.59 + b * 0.11
+		//
+		// ... but we scale these up to keep a better balance between color and alpha. Note
+		// that if the content is using alpha we'd recommend using the -a option to weight
+		// the color contribution by the alpha transparency.
+		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
+		{
+			config.cw_r_weight = 0.30f * 2.25f;
+			config.cw_g_weight = 0.59f * 2.25f;
+			config.cw_b_weight = 0.11f * 2.25f;
+		}
+	}
+	config.flags = flags;
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_context_alloc(
+	const astcenc_config* configp,
+	unsigned int thread_count,
+	astcenc_context** context
+) {
+	astcenc_error status;
+	const astcenc_config& config = *configp;
+
+	status = validate_cpu_isa();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_cpu_float();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	if (thread_count == 0)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	// Force single threaded compressor use in diagnostic mode.
+	if (thread_count != 1)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+#endif
+
+	astcenc_context* ctxo = new astcenc_context;
+	astcenc_contexti* ctx = &ctxo->context;
+	ctx->thread_count = thread_count;
+	ctx->config = config;
+	ctx->working_buffers = nullptr;
+
+	// These are allocated per-compress, as they depend on image size
+	ctx->input_alpha_averages = nullptr;
+
+	// Copy the config first and validate the copy (we may modify it)
+	status = validate_config(ctx->config);
+	if (status != ASTCENC_SUCCESS)
+	{
+		delete ctxo;
+		return status;
+	}
+
+	// NOTE(review): the aligned_malloc result is not null-checked before
+	// init_block_size_descriptor writes through it — confirm against upstream
+	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
+	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
+	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
+	                           can_omit_modes,
+	                           config.tune_partition_count_limit,
+	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
+	                           *ctx->bsd);
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	// Do setup only needed by compression
+	// NOTE(review): `status` is ASTCENC_SUCCESS (0) at this point, so this
+	// condition is always true and compression setup always runs; testing
+	// `config.flags` appears to be what was intended — confirm against upstream
+	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
+	{
+		// Turn a dB limit into a per-texel error for faster use later
+		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
+		{
+			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
+		}
+		else
+		{
+			ctx->config.tune_db_limit = 0.0f;
+		}
+
+		size_t worksize = sizeof(compression_working_buffers) * thread_count;
+		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
+		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
+		              "compression_working_buffers size must be multiple of vector alignment");
+		if (!ctx->working_buffers)
+		{
+			aligned_free<block_size_descriptor>(ctx->bsd);
+			delete ctxo;
+			*context = nullptr;
+			return ASTCENC_ERR_OUT_OF_MEM;
+		}
+	}
+#endif
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
+	if (!ctx->trace_log->m_file)
+	{
+		return ASTCENC_ERR_DTRACE_FAILURE;
+	}
+
+	trace_add_data("block_x", config.block_x);
+	trace_add_data("block_y", config.block_y);
+	trace_add_data("block_z", config.block_z);
+#endif
+
+	*context = ctxo;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	prepare_angular_tables();
+#endif
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+void astcenc_context_free(
+	astcenc_context* ctxo
+) {
+	// Safe to call with a null context; frees all context-owned allocations
+	if (ctxo)
+	{
+		astcenc_contexti* ctx = &ctxo->context;
+		aligned_free<compression_working_buffers>(ctx->working_buffers);
+		aligned_free<block_size_descriptor>(ctx->bsd);
+#if defined(ASTCENC_DIAGNOSTICS)
+		delete ctx->trace_log;
+#endif
+		delete ctxo;
+	}
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Compress an image, after any preflight has completed.
+ *
+ * @param[out] ctxo           The compressor context.
+ * @param      thread_index   The thread index.
+ * @param      image          The input image.
+ * @param      swizzle        The input swizzle.
+ * @param[out] buffer         The output array for the compressed data.
+ */
+static void compress_image(
+	astcenc_context& ctxo,
+	unsigned int thread_index,
+	const astcenc_image& image,
+	const astcenc_swizzle& swizzle,
+	uint8_t* buffer
+) {
+	astcenc_contexti& ctx = ctxo.context;
+	const block_size_descriptor& bsd = *ctx.bsd;
+	astcenc_profile decode_mode = ctx.config.profile;
+
+	image_block blk;
+
+	int block_x = bsd.xdim;
+	int block_y = bsd.ydim;
+	int block_z = bsd.zdim;
+	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
+
+	int dim_x = image.dim_x;
+	int dim_y = image.dim_y;
+	int dim_z = image.dim_z;
+
+	// Round up so partial blocks at the image edges are still emitted
+	int xblocks = (dim_x + block_x - 1) / block_x;
+	int yblocks = (dim_y + block_y - 1) / block_y;
+	int zblocks = (dim_z + block_z - 1) / block_z;
+	int block_count = zblocks * yblocks * xblocks;
+
+	int row_blocks = xblocks;
+	int plane_blocks = xblocks * yblocks;
+
+	// Populate the block channel weights
+	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
+	                             ctx.config.cw_g_weight,
+	                             ctx.config.cw_b_weight,
+	                             ctx.config.cw_a_weight);
+
+	// Use preallocated scratch buffer
+	auto& temp_buffers = ctx.working_buffers[thread_index];
+
+	// Only the first thread actually runs the initializer
+	ctxo.manage_compress.init(block_count);
+
+	// Determine if we can use an optimized load function
+	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
+	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
+
+	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
+	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
+
+	// Fast path: identity swizzle, LDR profile, 2D blocks, 8-bit input
+	bool use_fast_load = !needs_swz && !needs_hdr &&
+	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
+
+	auto load_func = load_image_block;
+	if (use_fast_load)
+	{
+		load_func = load_image_block_fast_ldr;
+	}
+
+	// All threads run this processing loop until there is no work remaining
+	while (true)
+	{
+		unsigned int count;
+		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
+		if (!count)
+		{
+			break;
+		}
+
+		for (unsigned int i = base; i < base + count; i++)
+		{
+			// Decode i into x, y, z block indices
+			int z = i / plane_blocks;
+			unsigned int rem = i - (z * plane_blocks);
+			int y = rem / row_blocks;
+			int x = rem - (y * row_blocks);
+
+			// Test if we can apply some basic alpha-scale RDO
+			bool use_full_block = true;
+			if (ctx.config.a_scale_radius != 0 && block_z == 1)
+			{
+				int start_x = x * block_x;
+				int end_x = astc::min(dim_x, start_x + block_x);
+
+				int start_y = y * block_y;
+				int end_y = astc::min(dim_y, start_y + block_y);
+
+				// SATs accumulate error, so don't test exactly zero. Test for
+				// less than 1 alpha in the expanded block footprint that
+				// includes the alpha radius.
+				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
+
+				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
+
+				float footprint = static_cast<float>(x_footprint * y_footprint);
+				float threshold = 0.9f / (255.0f * footprint);
+
+				// Do we have any alpha values?
+				use_full_block = false;
+				for (int ay = start_y; ay < end_y; ay++)
+				{
+					for (int ax = start_x; ax < end_x; ax++)
+					{
+						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
+						if (a_avg > threshold)
+						{
+							use_full_block = true;
+							// Break out of both loops by exhausting the counters
+							ax = end_x;
+							ay = end_y;
+						}
+					}
+				}
+			}
+
+			// Fetch the full block for compression
+			if (use_full_block)
+			{
+				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
+
+				// Scale RGB error contribution by the maximum alpha in the block
+				// This encourages preserving alpha accuracy in regions with high
+				// transparency, and can buy up to 0.5 dB PSNR.
+				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
+				{
+					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
+					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
+					                             ctx.config.cw_g_weight * alpha_scale,
+					                             ctx.config.cw_b_weight * alpha_scale,
+					                             ctx.config.cw_a_weight);
+				}
+			}
+			// Apply alpha scale RDO - substitute constant color block
+			else
+			{
+				blk.origin_texel = vfloat4::zero();
+				blk.data_min = vfloat4::zero();
+				blk.data_mean = vfloat4::zero();
+				blk.data_max = vfloat4::zero();
+				blk.grayscale = true;
+			}
+
+			// Each compressed block is 16 bytes in the output stream
+			int offset = ((z * yblocks + y) * xblocks + x) * 16;
+			uint8_t *bp = buffer + offset;
+			physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
+			compress_block(ctx, blk, *pcb, temp_buffers);
+		}
+
+		ctxo.manage_compress.complete_task_assignment(count);
+	}
+}
+
+/**
+ * @brief Compute regional averages in an image.
+ *
+ * This function can be called by multiple threads, but only after a single
+ * thread calls the setup function @c init_compute_averages().
+ *
+ * Results are written back into @c img->input_alpha_averages.
+ *
+ * @param[out] ctx   The context.
+ * @param      ag    The average and variance arguments created during setup.
+ */
+static void compute_averages(
+	astcenc_context& ctx,
+	const avg_args &ag
+) {
+	// Each thread gets its own private scratch allocation, freed on exit
+	pixel_region_args arg = ag.arg;
+	arg.work_memory = new vfloat4[ag.work_memory_size];
+
+	int size_x = ag.img_size_x;
+	int size_y = ag.img_size_y;
+	int size_z = ag.img_size_z;
+
+	int step_xy = ag.blk_size_xy;
+	int step_z = ag.blk_size_z;
+
+	int y_tasks = (size_y + step_xy - 1) / step_xy;
+
+	// All threads run this processing loop until there is no work remaining
+	while (true)
+	{
+		unsigned int count;
+		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
+		if (!count)
+		{
+			break;
+		}
+
+		for (unsigned int i = base; i < base + count; i++)
+		{
+			// NOTE(review): `z` is already scaled by step_z before being
+			// multiplied by y_tasks on the next line; for step_z > 1 (3D
+			// images) this task decode looks suspect — confirm against upstream
+			int z = (i / (y_tasks)) * step_z;
+			int y = (i - (z * y_tasks)) * step_xy;
+
+			arg.size_z = astc::min(step_z, size_z - z);
+			arg.offset_z = z;
+
+			arg.size_y = astc::min(step_xy, size_y - y);
+			arg.offset_y = y;
+
+			for (int x = 0; x < size_x; x += step_xy)
+			{
+				arg.size_x = astc::min(step_xy, size_x - x);
+				arg.offset_x = x;
+				compute_pixel_region_variance(ctx.context, arg);
+			}
+		}
+
+		ctx.manage_avg.complete_task_assignment(count);
+	}
+
+	delete[] arg.work_memory;
+}
+
+#endif
+
+/* See header for documentation. */
+astcenc_error astcenc_compress_image(
+	astcenc_context* ctxo,
+	astcenc_image* imagep,
+	const astcenc_swizzle* swizzle,
+	uint8_t* data_out,
+	size_t data_len,
+	unsigned int thread_index
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	(void)imagep;
+	(void)swizzle;
+	(void)data_out;
+	(void)data_len;
+	(void)thread_index;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+	astcenc_error status;
+	astcenc_image& image = *imagep;
+
+	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
+	{
+		return ASTCENC_ERR_BAD_CONTEXT;
+	}
+
+	status = validate_compression_swizzle(*swizzle);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	if (thread_index >= ctx->thread_count)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+	unsigned int block_x = ctx->config.block_x;
+	unsigned int block_y = ctx->config.block_y;
+	unsigned int block_z = ctx->config.block_z;
+
+	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
+	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
+	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
+
+	// Check we have enough output space (16 bytes per block)
+	// NOTE(review): the product is computed in unsigned int before widening to
+	// size_t, so this could overflow for extremely large images — confirm
+	size_t size_needed = xblocks * yblocks * zblocks * 16;
+	if (data_len < size_needed)
+	{
+		return ASTCENC_ERR_OUT_OF_MEM;
+	}
+
+	// If context thread count is one then implicitly reset
+	if (ctx->thread_count == 1)
+	{
+		astcenc_compress_reset(ctxo);
+	}
+
+	if (ctx->config.a_scale_radius != 0)
+	{
+		// First thread to enter will do setup, other threads will subsequently
+		// enter the critical section but simply skip over the initialization
+		auto init_avg = [ctx, &image, swizzle]() {
+			// Perform memory allocations for the destination buffers
+			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
+			ctx->input_alpha_averages = new float[texel_count];
+
+			return init_compute_averages(
+				image, ctx->config.a_scale_radius, *swizzle,
+				ctx->avg_preprocess_args);
+		};
+
+		// Only the first thread actually runs the initializer
+		ctxo->manage_avg.init(init_avg);
+
+		// All threads will enter this function and dynamically grab work
+		compute_averages(*ctxo, ctx->avg_preprocess_args);
+	}
+
+	// Wait for compute_averages to complete before compressing
+	ctxo->manage_avg.wait();
+
+	compress_image(*ctxo, thread_index, image, *swizzle, data_out);
+
+	// Wait for compress to complete before freeing memory
+	ctxo->manage_compress.wait();
+
+	auto term_compress = [ctx]() {
+		delete[] ctx->input_alpha_averages;
+		ctx->input_alpha_averages = nullptr;
+	};
+
+	// Only the first thread to arrive actually runs the term
+	ctxo->manage_compress.term(term_compress);
+
+	return ASTCENC_SUCCESS;
+#endif
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_compress_reset(
+	astcenc_context* ctxo
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
+	{
+		return ASTCENC_ERR_BAD_CONTEXT;
+	}
+
+	// Reset both work sequencers so a new image can be compressed
+	ctxo->manage_avg.reset();
+	ctxo->manage_compress.reset();
+	return ASTCENC_SUCCESS;
+#endif
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_decompress_image(
+	astcenc_context* ctxo,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_outp,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index
+) {
+	astcenc_error status;
+	astcenc_image& image_out = *image_outp;
+	astcenc_contexti* ctx = &ctxo->context;
+
+	// Today this doesn't matter (working set on stack) but might in future ...
+	if (thread_index >= ctx->thread_count)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+	status = validate_decompression_swizzle(*swizzle);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	unsigned int block_x = ctx->config.block_x;
+	unsigned int block_y = ctx->config.block_y;
+	unsigned int block_z = ctx->config.block_z;
+
+	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
+	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
+	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
+
+	int row_blocks = xblocks;
+	int plane_blocks = xblocks * yblocks;
+
+	// Check we have enough input data (16 bytes per compressed block)
+	size_t size_needed = xblocks * yblocks * zblocks * 16;
+	if (data_len < size_needed)
+	{
+		return ASTCENC_ERR_OUT_OF_MEM;
+	}
+
+	image_block blk;
+	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
+
+	// If context thread count is one then implicitly reset
+	if (ctx->thread_count == 1)
+	{
+		astcenc_decompress_reset(ctxo);
+	}
+
+	// Only the first thread actually runs the initializer
+	ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
+
+	// All threads run this processing loop until there is no work remaining
+	while (true)
+	{
+		unsigned int count;
+		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
+		if (!count)
+		{
+			break;
+		}
+
+		for (unsigned int i = base; i < base + count; i++)
+		{
+			// Decode i into x, y, z block indices
+			int z = i / plane_blocks;
+			unsigned int rem = i - (z * plane_blocks);
+			int y = rem / row_blocks;
+			int x = rem - (y * row_blocks);
+
+			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
+			const uint8_t* bp = data + offset;
+
+			const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
+			symbolic_compressed_block scb;
+
+			physical_to_symbolic(*ctx->bsd, pcb, scb);
+
+			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
+			                          x * block_x, y * block_y, z * block_z,
+			                          scb, blk);
+
+			// Write the decoded texels back out, applying the user swizzle
+			store_image_block(image_out, blk, *ctx->bsd,
+			                  x * block_x, y * block_y, z * block_z, *swizzle);
+		}
+
+		ctxo->manage_decompress.complete_task_assignment(count);
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_decompress_reset(
+	astcenc_context* ctxo
+) {
+	// Reset the work sequencer so a new image can be decompressed
+	ctxo->manage_decompress.reset();
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_get_block_info(
+	astcenc_context* ctxo,
+	const uint8_t data[16],
+	astcenc_block_info* info
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	(void)data;
+	(void)info;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+
+	// Decode the compressed data into a symbolic form
+	// NOTE(review): assumes `data` meets the alignment requirements of
+	// physical_compressed_block — confirm for callers passing raw byte buffers
+	const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
+	symbolic_compressed_block scb;
+	physical_to_symbolic(*ctx->bsd, pcb, scb);
+
+	// Fetch the appropriate partition and decimation tables
+	block_size_descriptor& bsd = *ctx->bsd;
+
+	// Start from a clean slate
+	memset(info, 0, sizeof(*info));
+
+	// Basic info we can always populate
+	info->profile = ctx->config.profile;
+
+	info->block_x = ctx->config.block_x;
+	info->block_y = ctx->config.block_y;
+	info->block_z = ctx->config.block_z;
+	info->texel_count = bsd.texel_count;
+
+	// Check for error blocks first
+	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
+	if (info->is_error_block)
+	{
+		return ASTCENC_SUCCESS;
+	}
+
+	// Check for constant color blocks second
+	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
+	                          scb.block_type == SYM_BTYPE_CONST_U16;
+	if (info->is_constant_block)
+	{
+		return ASTCENC_SUCCESS;
+	}
+
+	// Otherwise handle a full block ; known to be valid after conditions above have been checked
+	int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	info->weight_x = di.weight_x;
+	info->weight_y = di.weight_y;
+	info->weight_z = di.weight_z;
+
+	info->is_dual_plane_block = bm.is_dual_plane != 0;
+
+	info->partition_count = scb.partition_count;
+	info->partition_index = scb.partition_index;
+	info->dual_plane_component = scb.plane2_component;
+
+	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
+	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
+
+	// Unpack color endpoints for each active partition
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		bool rgb_hdr;
+		bool a_hdr;
+		vint4 endpnt[2];
+
+		unpack_color_endpoints(ctx->config.profile,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_hdr, a_hdr,
+		                       endpnt[0], endpnt[1]);
+
+		// Store the color endpoint mode info
+		info->color_endpoint_modes[i] = scb.color_formats[i];
+		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
+
+		// Store the unpacked and decoded color endpoint, selecting the LNS
+		// decode for HDR channels and the UNORM16 decode for LDR channels
+		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
+		for (int j = 0; j < 2; j++)
+		{
+			vint4 color_lns = lns_to_sf16(endpnt[j]);
+			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
+			vint4 datai = select(color_unorm, color_lns, hdr_mask);
+			store(float16_to_float(datai), info->color_endpoints[i][j]);
+		}
+	}
+
+	// Unpack weights for each texel
+	int weight_plane1[BLOCK_MAX_TEXELS];
+	int weight_plane2[BLOCK_MAX_TEXELS];
+
+	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
+	for (unsigned int i = 0; i < bsd.texel_count; i++)
+	{
+		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
+		if (info->is_dual_plane_block)
+		{
+			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
+		}
+	}
+
+	// Unpack partition assignments for each texel
+	for (unsigned int i = 0; i < bsd.texel_count; i++)
+	{
+		info->partition_assignment[i] = pi.partition_of_texel[i];
+	}
+
+	return ASTCENC_SUCCESS;
+#endif
+}
+
+/* See header for documentation. */
+const char* astcenc_get_error_string(
+	astcenc_error status
+) {
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values; unknown codes return nullptr
+	switch (static_cast<int>(status))
+	{
+	case ASTCENC_SUCCESS:
+		return "ASTCENC_SUCCESS";
+	case ASTCENC_ERR_OUT_OF_MEM:
+		return "ASTCENC_ERR_OUT_OF_MEM";
+	case ASTCENC_ERR_BAD_CPU_FLOAT:
+		return "ASTCENC_ERR_BAD_CPU_FLOAT";
+	case ASTCENC_ERR_BAD_CPU_ISA:
+		return "ASTCENC_ERR_BAD_CPU_ISA";
+	case ASTCENC_ERR_BAD_PARAM:
+		return "ASTCENC_ERR_BAD_PARAM";
+	case ASTCENC_ERR_BAD_BLOCK_SIZE:
+		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
+	case ASTCENC_ERR_BAD_PROFILE:
+		return "ASTCENC_ERR_BAD_PROFILE";
+	case ASTCENC_ERR_BAD_QUALITY:
+		return "ASTCENC_ERR_BAD_QUALITY";
+	case ASTCENC_ERR_BAD_FLAGS:
+		return "ASTCENC_ERR_BAD_FLAGS";
+	case ASTCENC_ERR_BAD_SWIZZLE:
+		return "ASTCENC_ERR_BAD_SWIZZLE";
+	case ASTCENC_ERR_BAD_CONTEXT:
+		return "ASTCENC_ERR_BAD_CONTEXT";
+	case ASTCENC_ERR_NOT_IMPLEMENTED:
+		return "ASTCENC_ERR_NOT_IMPLEMENTED";
+#if defined(ASTCENC_DIAGNOSTICS)
+	case ASTCENC_ERR_DTRACE_FAILURE:
+		return "ASTCENC_ERR_DTRACE_FAILURE";
+#endif
+	default:
+		return nullptr;
+	}
+}

+ 780 - 0
thirdparty/astcenc/astcenc_find_best_partitioning.cpp

@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for finding best partition for a block.
+ *
+ * The partition search operates in two stages. The first pass uses kmeans clustering to group
+ * texels into an ideal partitioning for the requested partition count, and then compares that
+ * against the 1024 partitionings generated by the ASTC partition hash function. The generated
+ * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
+ * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
+ * partitionings that actually generate fewer than the requested partition count, but only the top
+ * N candidates are actually put through a more detailed search. N is determined by the compressor
+ * quality preset.
+ *
+ * For the detailed search, each candidate is checked against two possible encoding methods:
+ *
+ *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
+ *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
+ *
+ * This is implemented by computing the mean color and dominant direction for each
+ * partition. This defines two lines, both of which go through the mean color value.
+ *
+ * - One line has a direction defined by the dominant direction; this is used to assess the error
+ *   from using an uncorrelated color representation.
+ * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
+ *   (RGB + scale) color representation.
+ *
+ * The best candidate is selected by computing the squared-errors that result from using these
+ * lines for endpoint selection.
+ */
+
+#include <limits>
+#include "astcenc_internal.h"
+
+/**
+ * @brief Pick some initial kmeans cluster centers.
+ *
+ * The first center is a fixed pseudo-random texel. Each later center is
+ * picked by a weighted-random draw in which texels far from every center
+ * chosen so far are more likely to be selected.
+ *
+ * @param      blk               The image block color data to compress.
+ * @param      texel_count       The number of texels in the block.
+ * @param      partition_count   The number of partitions in the block.
+ * @param[out] cluster_centers   The initial partition cluster center colors.
+ */
+static void kmeans_init(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	unsigned int clusters_selected = 0;
+	float distances[BLOCK_MAX_TEXELS];
+
+	// Pick a random sample as first cluster center; 145897 from random.org
+	unsigned int sample = 145897 % texel_count;
+	vfloat4 center_color = blk.texel(sample);
+	cluster_centers[clusters_selected] = center_color;
+	clusters_selected++;
+
+	// Compute the distance to the first cluster center; distances are
+	// squared color differences, weighted per channel by blk.channel_weight
+	float distance_sum = 0.0f;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		vfloat4 color = blk.texel(i);
+		vfloat4 diff = color - center_color;
+		float distance = dot_s(diff * diff, blk.channel_weight);
+		distance_sum += distance;
+		distances[i] = distance;
+	}
+
+	// More numbers from random.org for weighted-random center selection
+	const float cluster_cutoffs[9] {
+		0.626220f, 0.932770f, 0.275454f,
+		0.318558f, 0.240113f, 0.009190f,
+		0.347661f, 0.731960f, 0.156391f
+	};
+
+	// Index into cluster_cutoffs; each partition count (2-4) consumes up to
+	// three cutoff values, one per additional center that must be selected
+	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
+
+	// Pick the remaining samples as needed
+	while (true)
+	{
+		// Pick the next center in a weighted-random fashion.
+		float summa = 0.0f;
+		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
+		for (sample = 0; sample < texel_count; sample++)
+		{
+			summa += distances[sample];
+			if (summa >= distance_cutoff)
+			{
+				break;
+			}
+		}
+
+		// Clamp to a valid range and store the selected cluster center;
+		// sample can run one past the end if float rounding keeps summa
+		// below the cutoff for every texel
+		sample = astc::min(sample, texel_count - 1);
+
+		center_color = blk.texel(sample);
+		cluster_centers[clusters_selected++] = center_color;
+		if (clusters_selected >= partition_count)
+		{
+			break;
+		}
+
+		// Compute the distance to the new cluster center, keep the min dist
+		distance_sum = 0.0f;
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			vfloat4 color = blk.texel(i);
+			vfloat4 diff = color - center_color;
+			float distance = dot_s(diff * diff, blk.channel_weight);
+			distance = astc::min(distance, distances[i]);
+			distance_sum += distance;
+			distances[i] = distance;
+		}
+	}
+}
+
+/**
+ * @brief Assign texels to clusters, based on a set of chosen center points.
+ *
+ * Each texel is assigned to its nearest cluster center, measured with the
+ * squared channel-weighted color distance. A repair pass then guarantees
+ * that no partition is left without any texels.
+ *
+ * @param      blk                  The image block color data to compress.
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_count      The number of partitions in the block.
+ * @param      cluster_centers      The partition cluster center colors.
+ * @param[out] partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_assign(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	// Population count of each partition, used by the repair pass below
+	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Find the best partition for every texel
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		float best_distance = std::numeric_limits<float>::max();
+		unsigned int best_partition = 0;
+
+		vfloat4 color = blk.texel(i);
+		for (unsigned int j = 0; j < partition_count; j++)
+		{
+			vfloat4 diff = color - cluster_centers[j];
+			float distance = dot_s(diff * diff, blk.channel_weight);
+			if (distance < best_distance)
+			{
+				best_distance = distance;
+				best_partition = j;
+			}
+		}
+
+		partition_of_texel[i] = static_cast<uint8_t>(best_partition);
+		partition_texel_count[best_partition]++;
+	}
+
+	// It is possible to get a situation where a partition ends up without any texels. In this case,
+	// assign texel N to partition N. This is silly, but ensures that every partition retains at
+	// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
+	// so if we actually did a reassignment, run the whole loop over again.
+	bool problem_case;
+	do
+	{
+		problem_case = false;
+		for (unsigned int i = 0; i < partition_count; i++)
+		{
+			if (partition_texel_count[i] == 0)
+			{
+				// Steal texel i from whichever partition currently owns it
+				// and claim it for the empty partition i
+				partition_texel_count[partition_of_texel[i]]--;
+				partition_texel_count[i]++;
+				partition_of_texel[i] = static_cast<uint8_t>(i);
+				problem_case = true;
+			}
+		}
+	} while (problem_case);
+}
+
+/**
+ * @brief Recompute each cluster center as the centroid of its texels.
+ *
+ * @param       blk                  The image block color data to compress.
+ * @param       texel_count          The number of texels in the block.
+ * @param       partition_count      The number of partitions in the block.
+ * @param[out]  cluster_centers      The new cluster center colors.
+ * @param       partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_update(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	// Per-partition color accumulators and populations
+	vfloat4 sums[BLOCK_MAX_PARTITIONS] {
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero()
+	};
+
+	uint8_t counts[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Accumulate the color total and texel population of each cluster
+	for (unsigned int tix = 0; tix < texel_count; tix++)
+	{
+		uint8_t part = partition_of_texel[tix];
+		sums[part] += blk.texel(tix);
+		counts[part]++;
+	}
+
+	// The new center of each cluster is its color centroid
+	for (unsigned int pix = 0; pix < partition_count; pix++)
+	{
+		float scale = 1.0f / static_cast<float>(counts[pix]);
+		cluster_centers[pix] = sums[pix] * scale;
+	}
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 2-partition mode.
+ *
+ * Partition labels are arbitrary, so both pairings of block partitions to
+ * table partitions are tried and the cheaper one is kept.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch2(
+	const uint64_t a[2],
+	const uint64_t b[2]
+) {
+	// Direct pairing: a0 with b0, a1 with b1
+	int direct = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
+	// Swapped pairing: a0 with b1, a1 with b0
+	int swapped = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
+	return astc::min(direct, swapped);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 3-partition mode.
+ *
+ * Partition labels are arbitrary, so the result is minimized over all six
+ * ways of pairing block partitions with table partitions.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch3(
+	const uint64_t a[3],
+	const uint64_t b[3]
+) {
+	// Pairwise mismatch cost between each block/table partition pair
+	int p[3][3];
+	for (int i = 0; i < 3; i++)
+	{
+		for (int j = 0; j < 3; j++)
+		{
+			p[i][j] = popcount(a[i] ^ b[j]);
+		}
+	}
+
+	// Best total cost for each choice of table partner for block row 0;
+	// the remaining two rows take the cheaper of their two pairings
+	int via_b0 = p[0][0] + astc::min(p[1][1] + p[2][2], p[1][2] + p[2][1]);
+	int via_b1 = p[0][1] + astc::min(p[1][0] + p[2][2], p[1][2] + p[2][0]);
+	int via_b2 = p[0][2] + astc::min(p[1][0] + p[2][1], p[1][1] + p[2][0]);
+
+	return astc::min(via_b0, via_b1, via_b2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 4-partition mode.
+ *
+ * Partition labels are arbitrary, so the result is minimized over all 24
+ * ways of pairing block partitions with table partitions.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch4(
+	const uint64_t a[4],
+	const uint64_t b[4]
+) {
+	// pIJ is the mismatch cost of pairing block partition I with table
+	// partition J
+	int p00 = popcount(a[0] ^ b[0]);
+	int p01 = popcount(a[0] ^ b[1]);
+	int p02 = popcount(a[0] ^ b[2]);
+	int p03 = popcount(a[0] ^ b[3]);
+
+	int p10 = popcount(a[1] ^ b[0]);
+	int p11 = popcount(a[1] ^ b[1]);
+	int p12 = popcount(a[1] ^ b[2]);
+	int p13 = popcount(a[1] ^ b[3]);
+
+	int p20 = popcount(a[2] ^ b[0]);
+	int p21 = popcount(a[2] ^ b[1]);
+	int p22 = popcount(a[2] ^ b[2]);
+	int p23 = popcount(a[2] ^ b[3]);
+
+	int p30 = popcount(a[3] ^ b[0]);
+	int p31 = popcount(a[3] ^ b[1]);
+	int p32 = popcount(a[3] ^ b[2]);
+	int p33 = popcount(a[3] ^ b[3]);
+
+	// mxXY is the cheapest way to cover table partitions X and Y using
+	// block partitions 2 and 3 (the minimum of the two possible pairings)
+	int mx23 = astc::min(p22 + p33, p23 + p32);
+	int mx13 = astc::min(p21 + p33, p23 + p31);
+	int mx12 = astc::min(p21 + p32, p22 + p31);
+	int mx03 = astc::min(p20 + p33, p23 + p30);
+	int mx02 = astc::min(p20 + p32, p22 + p30);
+	int mx01 = astc::min(p21 + p30, p20 + p31);
+
+	// vK is the best total cost when block partition 0 is paired with table
+	// partition K, minimized over the three pairings for block partition 1
+	int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
+	int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
+	int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
+	int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
+
+	return astc::min(v0, v1, v2, v3);
+}
+
+// Function pointer type matching the partition_mismatchN helpers.
+// NOTE(review): this alias appears unused in this file - confirm before
+// removing.
+using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
+
+/**
+ * @brief Count the partition table mismatches vs the data clustering.
+ *
+ * @param      bsd               The block size information.
+ * @param      partition_count   The number of partitions in the block.
+ * @param      bitmaps           The block texel partition assignment patterns.
+ * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
+ */
+static void count_partition_mismatch_bits(
+	const block_size_descriptor& bsd,
+	unsigned int partition_count,
+	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
+) {
+	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
+	promise(active_count > 0);
+
+	// Dispatch on partition count; each helper handles one bitmap width
+	switch (partition_count)
+	{
+	case 2:
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
+		}
+		break;
+	case 3:
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
+		}
+		break;
+	default:
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
+		}
+		break;
+	}
+}
+
+/**
+ * @brief Use counting sort on the mismatch array to sort partition candidates.
+ *
+ * @param      partitioning_count   The number of packed partitionings.
+ * @param      mismatch_count       Partitioning mismatch counts, in index order.
+ * @param[out] partition_ordering   Partition index values, in mismatch order.
+ *
+ * @return The number of active partitions in this selection.
+ */
+static unsigned int get_partition_ordering_by_mismatch_bits(
+	unsigned int partitioning_count,
+	const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	promise(partitioning_count > 0);
+
+	// Histogram of mismatch scores; scores index into 256 buckets
+	unsigned int histogram[256] { 0 };
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		histogram[mismatch_count[i]]++;
+	}
+
+	// Entries scoring 255 are treated as inactive and excluded from the
+	// active count (NOTE(review): presumably 255 marks unusable
+	// partitionings upstream of this call - confirm)
+	unsigned int valid_count = partitioning_count - histogram[255];
+
+	// Exclusive prefix sum turns the histogram into first-slot offsets
+	unsigned int offset = 0;
+	for (unsigned int i = 0; i < 256; i++)
+	{
+		unsigned int bucket_size = histogram[i];
+		histogram[i] = offset;
+		offset += bucket_size;
+	}
+
+	// Scatter each partitioning into its sorted slot, post-incrementing the
+	// bucket offset so equal scores land in sequential slots
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		partition_ordering[histogram[mismatch_count[i]]++] = i;
+	}
+
+	return valid_count;
+}
+
+/**
+ * @brief Use k-means clustering to compute a partition ordering for a block.
+ *
+ * @param      bsd                  The block size information.
+ * @param      blk                  The image block color data to compress.
+ * @param      partition_count      The desired number of partitions in the block.
+ * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
+ *
+ * @return The number of active partitionings in this selection.
+ */
+static unsigned int compute_kmeans_partition_ordering(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	vfloat4 centers[BLOCK_MAX_PARTITIONS];
+	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
+
+	// Seed the clusters and run three assignment passes, refining the
+	// cluster centers between consecutive passes
+	kmeans_init(blk, bsd.texel_count, partition_count, centers);
+	kmeans_assign(blk, bsd.texel_count, partition_count, centers, texel_partitions);
+
+	for (unsigned int pass = 0; pass < 2; pass++)
+	{
+		kmeans_update(blk, bsd.texel_count, partition_count, centers, texel_partitions);
+		kmeans_assign(blk, bsd.texel_count, partition_count, centers, texel_partitions);
+	}
+
+	// Build per-partition bitmaps of the texel assignments, using only the
+	// subset of texels sampled for kmeans purposes
+	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
+	unsigned int texel_limit = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+	promise(texel_limit > 0);
+	for (unsigned int i = 0; i < texel_limit; i++)
+	{
+		unsigned int texel = bsd.kmeans_texels[i];
+		bitmaps[texel_partitions[texel]] |= 1ULL << i;
+	}
+
+	// Score every candidate partitioning against the clustering ...
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
+	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
+
+	// ... and return the candidate indices sorted best match first
+	return get_partition_ordering_by_mismatch_bits(
+	    bsd.partitioning_count_selected[partition_count - 1],
+	    mismatch_counts, partition_ordering);
+}
+
+/**
+ * @brief Insert a partitioning into an order list of results, sorted by error.
+ *
+ * Candidates no better than the current worst entry are ignored; otherwise
+ * the tail of the list is shifted down to open the insertion slot.
+ *
+ * @param      max_values      The max number of entries in the best result arrays.
+ * @param      this_error      The error of the new entry.
+ * @param      this_partition  The partition ID of the new entry.
+ * @param[out] best_errors     The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+	unsigned int max_values,
+	float this_error,
+	unsigned int this_partition,
+	float* best_errors,
+	unsigned int* best_partitions)
+{
+	promise(max_values > 0);
+
+	// Fast reject: candidate is no better than the current worst entry
+	if (this_error >= best_errors[max_values - 1])
+	{
+		return;
+	}
+
+	// Find the first slot the candidate beats or ties; guaranteed to stop
+	// before the end of the array by the fast reject test above
+	unsigned int slot = 0;
+	while (this_error > best_errors[slot])
+	{
+		slot++;
+	}
+
+	// Shift the tail of the list down one place ...
+	for (unsigned int j = max_values - 1; j > slot; j--)
+	{
+		best_errors[j] = best_errors[j - 1];
+		best_partitions[j] = best_partitions[j - 1];
+	}
+
+	// ... and write the candidate into its sorted position
+	best_errors[slot] = this_error;
+	best_partitions[slot] = this_partition;
+}
+
+/* See header for documentation. */
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates
+) {
+	// Constant used to estimate quantization error for a given partitioning; the optimal value for
+	// this depends on bitrate. These values have been determined empirically.
+	unsigned int texels_per_block = bsd.texel_count;
+	float weight_imprecision_estim = 0.055f;
+	if (texels_per_block <= 20)
+	{
+		weight_imprecision_estim = 0.03f;
+	}
+	else if (texels_per_block <= 31)
+	{
+		weight_imprecision_estim = 0.04f;
+	}
+	else if (texels_per_block <= 41)
+	{
+		weight_imprecision_estim = 0.05f;
+	}
+
+	promise(partition_count > 0);
+	promise(partition_search_limit > 0);
+
+	// The estimate is applied to squared errors, so square it here
+	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
+
+	// Rank partitionings by kmeans cluster similarity and clamp the search
+	// and candidate counts to the number of usable partitionings
+	unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
+	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
+	partition_search_limit = astc::min(partition_search_limit, sequence_len);
+	requested_candidates = astc::min(partition_search_limit, requested_candidates);
+
+	// Blocks with a constant alpha channel can use the cheaper RGB-only path
+	bool uses_alpha = !blk.is_constant_channel(3);
+
+	// Partitioning errors assuming uncorrelated-chrominance endpoints
+	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	// Partitioning errors assuming same-chrominance endpoints
+	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
+		samec_best_errors[i] = ERROR_CALC_DEFAULT;
+	}
+
+	if (uses_alpha)
+	{
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+			compute_avgs_and_dirs_4_comp(pi, blk, pms);
+
+			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
+			line4 samec_lines[BLOCK_MAX_PARTITIONS];
+
+			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
+			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
+
+			float uncor_line_lens[BLOCK_MAX_PARTITIONS];
+			float samec_line_lens[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+
+				// Uncorrelated line: through the mean, along the dominant
+				// direction of the partition's colors
+				uncor_lines[j].a = pm.avg;
+				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
+
+				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
+				uncor_plines[j].bs = uncor_lines[j].b;
+
+				// Same-chroma line: through the origin, towards the mean
+				samec_lines[j].a = vfloat4::zero();
+				samec_lines[j].b = normalize_safe(pm.avg, unit4());
+
+				samec_plines[j].amod = vfloat4::zero();
+				samec_plines[j].bs = samec_lines[j].b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgba(pi,
+			                           blk,
+			                           uncor_plines,
+			                           samec_plines,
+			                           uncor_line_lens,
+			                           samec_line_lens,
+			                           uncor_error,
+			                           samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
+				vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
+
+				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+
+			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+				partition_lines3& pl = plines[j];
+
+				// Uncorrelated line: through the mean, along the dominant
+				// direction; same-chroma line: through the origin, towards
+				// the mean
+				pl.uncor_line.a = pm.avg;
+				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
+
+				pl.samec_line.a = vfloat4::zero();
+				pl.samec_line.b = normalize_safe(pm.avg, unit3());
+
+				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
+				pl.uncor_pline.bs   = pl.uncor_line.b;
+
+				pl.samec_pline.amod = vfloat4::zero();
+				pl.samec_pline.bs   = pl.samec_line.b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgb(pi,
+			                          blk,
+			                          plines,
+			                          uncor_error,
+			                          samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_lines3& pl = plines[j];
+
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
+				vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
+
+				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+
+	// NOTE(review): this compares the two winning partition IDs, not their
+	// error values - presumably a deliberate ordering heuristic, but
+	// confirm against upstream intent before changing
+	bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
+
+	// Interleave the uncorrelated and same-chroma candidate lists, placing
+	// the preferred list in the even slots
+	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		if (best_is_uncor)
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+		}
+		else
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+		}
+	}
+
+	// One bit per possible partitioning index, used to skip duplicates
+	uint64_t bitmasks[1024/64] { 0 };
+	unsigned int emitted = 0;
+
+	// Deduplicate the first "requested" entries
+	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
+	{
+		unsigned int partition = interleave[i];
+
+		unsigned int word = partition / 64;
+		unsigned int bit = partition % 64;
+
+		bool written = bitmasks[word] & (1ull << bit);
+
+		if (!written)
+		{
+			best_partitions[emitted] = partition;
+			bitmasks[word] |= 1ull << bit;
+			emitted++;
+
+			if (emitted == requested_candidates)
+			{
+				break;
+			}
+		}
+	}
+
+	return emitted;
+}
+
+#endif

+ 1663 - 0
thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp

@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for computing color endpoints and texel weights.
+ */
+
+#include <cassert>
+
+#include "astcenc_internal.h"
+#include "astcenc_vecmathlib.h"
+
+/**
+ * @brief Compute the infilled weight for N texel indices in a decimated grid.
+ *
+ * Each texel blends up to four decimated grid weights using precomputed
+ * bilinear contribution factors.
+ *
+ * @param di        The weight grid decimation to use.
+ * @param weights   The decimated weight values to use.
+ * @param index     The first texel index to interpolate.
+ *
+ * @return The interpolated weight for the given set of SIMD_WIDTH texels.
+ */
+static vfloat bilinear_infill_vla(
+	const decimation_info& di,
+	const float* weights,
+	unsigned int index
+) {
+	// Fetch the indices of the four decimated weights feeding each texel
+	vint idx0 = vint(di.texel_weights_tr[0] + index);
+	vint idx1 = vint(di.texel_weights_tr[1] + index);
+	vint idx2 = vint(di.texel_weights_tr[2] + index);
+	vint idx3 = vint(di.texel_weights_tr[3] + index);
+
+	// Gather the four decimated weight values
+	vfloat val0 = gatherf(weights, idx0);
+	vfloat val1 = gatherf(weights, idx1);
+	vfloat val2 = gatherf(weights, idx2);
+	vfloat val3 = gatherf(weights, idx3);
+
+	// Load the per-tap contribution factors for each decimated weight
+	vfloat fac0 = loada(di.texel_weight_contribs_float_tr[0] + index);
+	vfloat fac1 = loada(di.texel_weight_contribs_float_tr[1] + index);
+	vfloat fac2 = loada(di.texel_weight_contribs_float_tr[2] + index);
+	vfloat fac3 = loada(di.texel_weight_contribs_float_tr[3] + index);
+
+	// Blend the four taps; keep the pairwise grouping for bit-exact results
+	return (val0 * fac0 + val1 * fac1) +
+	       (val2 * fac2 + val3 * fac3);
+}
+
+/**
+ * @brief Compute the infilled weight for N texel indices in a decimated grid.
+ *
+ * This is a specialized version which computes only two weights per texel for
+ * encodings that are only decimated in a single axis.
+ *
+ * @param di        The weight grid decimation to use.
+ * @param weights   The decimated weight values to use.
+ * @param index     The first texel index to interpolate.
+ *
+ * @return The interpolated weight for the given set of SIMD_WIDTH texels.
+ */
+static vfloat bilinear_infill_vla_2(
+	const decimation_info& di,
+	const float* weights,
+	unsigned int index
+) {
+	// Fetch the indices of the two decimated weights feeding each texel
+	vint idx0 = vint(di.texel_weights_tr[0] + index);
+	vint idx1 = vint(di.texel_weights_tr[1] + index);
+
+	// Gather the two decimated weight values
+	vfloat val0 = gatherf(weights, idx0);
+	vfloat val1 = gatherf(weights, idx1);
+
+	// Load the per-tap contribution factors for each decimated weight
+	vfloat fac0 = loada(di.texel_weight_contribs_float_tr[0] + index);
+	vfloat fac1 = loada(di.texel_weight_contribs_float_tr[1] + index);
+
+	// Blend the two taps to produce the interpolated per-texel weight
+	return (val0 * fac0 + val1 * fac1);
+}
+
+/**
+ * @brief Compute the ideal endpoints and weights for 1 color component.
+ *
+ * Finds the min/max range of the selected component per partition, maps each
+ * texel's value onto that range as a [0, 1] weight, and stores the range ends
+ * as the endpoints for that component's lane.
+ *
+ * @param      blk         The image block color data to compress.
+ * @param      pi          The partition info for the current trial.
+ * @param[out] ei          The computed ideal endpoints and weights.
+ * @param      component   The color component to compute.
+ */
+static void compute_ideal_colors_and_weights_1_comp(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei,
+	unsigned int component
+) {
+	unsigned int partition_count = pi.partition_count;
+	ei.ep.partition_count = partition_count;
+	promise(partition_count > 0);
+
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// Select the error weight and the source data plane for the component
+	float error_weight;
+	const float* data_vr = nullptr;
+
+	assert(component < BLOCK_MAX_COMPONENTS);
+	switch (component)
+	{
+	case 0:
+		error_weight = blk.channel_weight.lane<0>();
+		data_vr = blk.data_r;
+		break;
+	case 1:
+		error_weight = blk.channel_weight.lane<1>();
+		data_vr = blk.data_g;
+		break;
+	case 2:
+		error_weight = blk.channel_weight.lane<2>();
+		data_vr = blk.data_b;
+		break;
+	default:
+		assert(component == 3);
+		error_weight = blk.channel_weight.lane<3>();
+		data_vr = blk.data_a;
+		break;
+	}
+
+	// Lane mask selecting only the component encoded by this plane
+	vmask4 sep_mask = vint4::lane_id() == vint4(component);
+	// Track whether every partition produces the same weight error scale
+	bool is_constant_wes { true };
+	float partition0_len_sq { 0.0f };
+
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Find the value range of this component within the partition
+		float lowvalue { 1e10f };
+		float highvalue { -1e10f };
+
+		unsigned int partition_texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			float value = data_vr[tix];
+			lowvalue = astc::min(value, lowvalue);
+			highvalue = astc::max(value, highvalue);
+		}
+
+		// A constant-color partition yields a zero-length range; substitute
+		// a tiny range to avoid NaN from the reciprocal below
+		if (highvalue <= lowvalue)
+		{
+			lowvalue = 0.0f;
+			highvalue = 1e-7f;
+		}
+
+		float length = highvalue - lowvalue;
+		float length_squared = length * length;
+		float scale = 1.0f / length;
+
+		if (i == 0)
+		{
+			partition0_len_sq = length_squared;
+		}
+		else
+		{
+			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
+		}
+
+		// Normalize each texel's value into a clamped [0, 1] weight
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			float value = (data_vr[tix] - lowvalue) * scale;
+			value = astc::clamp1f(value);
+
+			ei.weights[tix] = value;
+			ei.weight_error_scale[tix] = length_squared * error_weight;
+			assert(!astc::isnan(ei.weight_error_scale[tix]));
+		}
+
+		// Other lanes keep the block min/max; the separate component lane
+		// gets the fitted low/high endpoints
+		ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
+		ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
+	}
+
+	// Zero initialize any SIMD over-fetch
+	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	{
+		ei.weights[i] = 0.0f;
+		ei.weight_error_scale[i] = 0.0f;
+	}
+
+	ei.is_constant_weight_error_scale = is_constant_wes;
+}
+
+/**
+ * @brief Compute the ideal endpoints and weights for 2 color components.
+ *
+ * Fits a 2D line through each partition's texels in the selected component
+ * pair, projects texels onto it to get [0, 1] weights, and reconstructs the
+ * endpoints for the two component lanes from the projected extremes.
+ *
+ * @param      blk          The image block color data to compress.
+ * @param      pi           The partition info for the current trial.
+ * @param[out] ei           The computed ideal endpoints and weights.
+ * @param      component1   The first color component to compute.
+ * @param      component2   The second color component to compute.
+ */
+static void compute_ideal_colors_and_weights_2_comp(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei,
+	int component1,
+	int component2
+) {
+	unsigned int partition_count = pi.partition_count;
+	ei.ep.partition_count = partition_count;
+	promise(partition_count > 0);
+
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+	// Select the averaged error weight and the two source data planes
+	float error_weight;
+	const float* data_vr = nullptr;
+	const float* data_vg = nullptr;
+
+	if (component1 == 0 && component2 == 1)
+	{
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_g;
+	}
+	else if (component1 == 0 && component2 == 2)
+	{
+		error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
+
+		data_vr = blk.data_r;
+		data_vg = blk.data_b;
+	}
+	else // (component1 == 1 && component2 == 2)
+	{
+		assert(component1 == 1 && component2 == 2);
+
+		error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
+
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+	}
+
+	// Compute the average and dominant direction for each partition
+	compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
+
+	// Track whether every partition produces the same weight error scale
+	bool is_constant_wes { true };
+	float partition0_len_sq { 0.0f };
+
+	vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
+	vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
+
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Keep a consistent direction orientation across partitions
+		vfloat4 dir = pms[i].dir;
+		if (hadd_s(dir) < 0.0f)
+		{
+			dir = vfloat4::zero() - dir;
+		}
+
+		line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
+		float lowparam { 1e10f };
+		float highparam { -1e10f };
+
+		// Project each texel onto the partition line to get its parameter
+		unsigned int partition_texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
+			float param = dot_s(point - line.a, line.b);
+			ei.weights[tix] = param;
+
+			lowparam = astc::min(param, lowparam);
+			highparam = astc::max(param, highparam);
+		}
+
+		// It is possible for a uniform-color partition to produce length=0;
+		// this causes NaN issues so set to small value to avoid this problem
+		if (highparam <= lowparam)
+		{
+			lowparam = 0.0f;
+			highparam = 1e-7f;
+		}
+
+		float length = highparam - lowparam;
+		float length_squared = length * length;
+		float scale = 1.0f / length;
+
+		if (i == 0)
+		{
+			partition0_len_sq = length_squared;
+		}
+		else
+		{
+			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
+		}
+
+		// Remap projected parameters into clamped [0, 1] weights
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			float idx = (ei.weights[tix] - lowparam) * scale;
+			idx = astc::clamp1f(idx);
+
+			ei.weights[tix] = idx;
+			ei.weight_error_scale[tix] = length_squared * error_weight;
+			assert(!astc::isnan(ei.weight_error_scale[tix]));
+		}
+
+		// Evaluate the line at the parameter extremes to get endpoint colors
+		vfloat4 lowvalue = line.a + line.b * lowparam;
+		vfloat4 highvalue = line.a + line.b * highparam;
+
+		// Insert the fitted values into the two component lanes; the other
+		// lanes keep the block min/max
+		vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
+		vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
+
+		ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
+		ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
+	}
+
+	// Zero initialize any SIMD over-fetch
+	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	{
+		ei.weights[i] = 0.0f;
+		ei.weight_error_scale[i] = 0.0f;
+	}
+
+	ei.is_constant_weight_error_scale = is_constant_wes;
+}
+
+/**
+ * @brief Compute the ideal endpoints and weights for 3 color components.
+ *
+ * Fits a 3D line through each partition's texels in the three non-omitted
+ * components, projects texels onto it to get [0, 1] weights, and rebuilds
+ * 4-wide endpoints with the omitted lane taken from the block min/max.
+ *
+ * @param      blk                 The image block color data to compress.
+ * @param      pi                  The partition info for the current trial.
+ * @param[out] ei                  The computed ideal endpoints and weights.
+ * @param      omitted_component   The color component excluded from the calculation.
+ */
+static void compute_ideal_colors_and_weights_3_comp(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei,
+	unsigned int omitted_component
+) {
+	unsigned int partition_count = pi.partition_count;
+	ei.ep.partition_count = partition_count;
+	promise(partition_count > 0);
+
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+	// Select the error weight and the three source data planes, skipping
+	// the omitted component
+	float error_weight;
+	const float* data_vr = nullptr;
+	const float* data_vg = nullptr;
+	const float* data_vb = nullptr;
+	if (omitted_component == 0)
+	{
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
+		data_vr = blk.data_g;
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 1)
+	{
+		error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
+		data_vr = blk.data_r;
+		data_vg = blk.data_b;
+		data_vb = blk.data_a;
+	}
+	else if (omitted_component == 2)
+	{
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
+		data_vr = blk.data_r;
+		data_vg = blk.data_g;
+		data_vb = blk.data_a;
+	}
+	else
+	{
+		assert(omitted_component == 3);
+
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
+		data_vr = blk.data_r;
+		data_vg = blk.data_g;
+		data_vb = blk.data_b;
+	}
+
+	// Average the three channel weights
+	error_weight = error_weight * (1.0f / 3.0f);
+
+	// Compute the average and dominant direction for each partition, using
+	// the dedicated RGB path when alpha is the omitted component
+	if (omitted_component == 3)
+	{
+		compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+	}
+	else
+	{
+		compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
+	}
+
+	// Track whether every partition produces the same weight error scale
+	bool is_constant_wes { true };
+	float partition0_len_sq { 0.0f };
+
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Keep a consistent direction orientation across partitions
+		vfloat4 dir = pms[i].dir;
+		if (hadd_rgb_s(dir) < 0.0f)
+		{
+			dir = vfloat4::zero() - dir;
+		}
+
+		line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
+		float lowparam { 1e10f };
+		float highparam { -1e10f };
+
+		// Project each texel onto the partition line to get its parameter
+		unsigned int partition_texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
+			float param = dot3_s(point - line.a, line.b);
+			ei.weights[tix] = param;
+
+			lowparam = astc::min(param, lowparam);
+			highparam = astc::max(param, highparam);
+		}
+
+		// It is possible for a uniform-color partition to produce length=0;
+		// this causes NaN issues so set to small value to avoid this problem
+		if (highparam <= lowparam)
+		{
+			lowparam = 0.0f;
+			highparam = 1e-7f;
+		}
+
+		float length = highparam - lowparam;
+		float length_squared = length * length;
+		float scale = 1.0f / length;
+
+		if (i == 0)
+		{
+			partition0_len_sq = length_squared;
+		}
+		else
+		{
+			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
+		}
+
+		// Remap projected parameters into clamped [0, 1] weights
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			float idx = (ei.weights[tix] - lowparam) * scale;
+			idx = astc::clamp1f(idx);
+
+			ei.weights[tix] = idx;
+			ei.weight_error_scale[tix] = length_squared * error_weight;
+			assert(!astc::isnan(ei.weight_error_scale[tix]));
+		}
+
+		// Evaluate the line at the parameter extremes to get endpoint colors
+		vfloat4 ep0 = line.a + line.b * lowparam;
+		vfloat4 ep1 = line.a + line.b * highparam;
+
+		vfloat4 bmin = blk.data_min;
+		vfloat4 bmax = blk.data_max;
+
+		// Rebuild 4-wide endpoints, reinserting the block min/max into the
+		// omitted component's lane
+		assert(omitted_component < BLOCK_MAX_COMPONENTS);
+		switch (omitted_component)
+		{
+			case 0:
+				ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
+				ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
+				break;
+			case 1:
+				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
+				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
+				break;
+			case 2:
+				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
+				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
+				break;
+			default:
+				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
+				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
+				break;
+		}
+	}
+
+	// Zero initialize any SIMD over-fetch
+	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	{
+		ei.weights[i] = 0.0f;
+		ei.weight_error_scale[i] = 0.0f;
+	}
+
+	ei.is_constant_weight_error_scale = is_constant_wes;
+}
+
+/**
+ * @brief Compute the ideal endpoints and weights for 4 color components.
+ *
+ * Fits a 4D line through each partition's texels, projects texels onto it to
+ * get [0, 1] weights, and evaluates the line at the parameter extremes to
+ * produce the two endpoints.
+ *
+ * @param      blk   The image block color data to compress.
+ * @param      pi    The partition info for the current trial.
+ * @param[out] ei    The computed ideal endpoints and weights.
+ */
+static void compute_ideal_colors_and_weights_4_comp(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei
+) {
+	// Average the four channel weights
+	const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
+
+	unsigned int partition_count = pi.partition_count;
+
+	unsigned int texel_count = blk.texel_count;
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+	// Compute the average and dominant direction for each partition
+	compute_avgs_and_dirs_4_comp(pi, blk, pms);
+
+	// Track whether every partition produces the same weight error scale
+	bool is_constant_wes { true };
+	float partition0_len_sq { 0.0f };
+
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Keep a consistent direction orientation across partitions
+		vfloat4 dir = pms[i].dir;
+		if (hadd_rgb_s(dir) < 0.0f)
+		{
+			dir = vfloat4::zero() - dir;
+		}
+
+		line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
+		float lowparam { 1e10f };
+		float highparam { -1e10f };
+
+		// Project each texel onto the partition line to get its parameter
+		unsigned int partition_texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vfloat4 point = blk.texel(tix);
+			float param = dot_s(point - line.a, line.b);
+			ei.weights[tix] = param;
+
+			lowparam = astc::min(param, lowparam);
+			highparam = astc::max(param, highparam);
+		}
+
+		// It is possible for a uniform-color partition to produce length=0;
+		// this causes NaN issues so set to small value to avoid this problem
+		if (highparam <= lowparam)
+		{
+			lowparam = 0.0f;
+			highparam = 1e-7f;
+		}
+
+		float length = highparam - lowparam;
+		float length_squared = length * length;
+		float scale = 1.0f / length;
+
+		if (i == 0)
+		{
+			partition0_len_sq = length_squared;
+		}
+		else
+		{
+			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
+		}
+
+		// Evaluate the line at the parameter extremes to get the endpoints
+		ei.ep.endpt0[i] = line.a + line.b * lowparam;
+		ei.ep.endpt1[i] = line.a + line.b * highparam;
+
+		// Remap projected parameters into clamped [0, 1] weights
+		for (unsigned int j = 0; j < partition_texel_count; j++)
+		{
+			unsigned int tix = pi.texels_of_partition[i][j];
+			float idx = (ei.weights[tix] - lowparam) * scale;
+			idx = astc::clamp1f(idx);
+
+			ei.weights[tix] = idx;
+			ei.weight_error_scale[tix] = length_squared * error_weight;
+			assert(!astc::isnan(ei.weight_error_scale[tix]));
+		}
+	}
+
+	// Zero initialize any SIMD over-fetch
+	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	{
+		ei.weights[i] = 0.0f;
+		ei.weight_error_scale[i] = 0.0f;
+	}
+
+	ei.is_constant_weight_error_scale = is_constant_wes;
+}
+
+/* See header for documentation. */
+void compute_ideal_colors_and_weights_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei
+) {
+	// A constant alpha channel does not need to be fitted, so only the three
+	// color components require ideal weights in that case
+	if (blk.is_constant_channel(3))
+	{
+		compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
+	}
+	else
+	{
+		compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
+	}
+}
+
+/* See header for documentation. */
+void compute_ideal_colors_and_weights_2planes(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int plane2_component,
+	endpoints_and_weights& ei1,
+	endpoints_and_weights& ei2
+) {
+	const auto& pi = bsd.get_partition_info(1, 0);
+	bool uses_alpha = !blk.is_constant_channel(3);
+
+	assert(plane2_component < BLOCK_MAX_COMPONENTS);
+
+	if (plane2_component == 3)
+	{
+		// Separate weights for alpha; alpha must be a live channel here
+		assert(uses_alpha);
+		compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
+	}
+	else if (uses_alpha)
+	{
+		// Plane 1 covers the three components other than plane2_component
+		compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, plane2_component);
+	}
+	else
+	{
+		// Alpha is constant, so plane 1 only needs the two color components
+		// that are not assigned to plane 2
+		unsigned int c1 = (plane2_component == 0) ? 1 : 0;
+		unsigned int c2 = (plane2_component == 2) ? 1 : 2;
+		compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, c1, c2);
+	}
+
+	// Plane 2 always carries just the selected component
+	compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, plane2_component);
+}
+
+/* See header for documentation. */
+float compute_error_of_weight_set_1plane(
+	const endpoints_and_weights& eai,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue
+) {
+	vfloatacc err_sum = vfloatacc::zero();
+	unsigned int texel_count = di.texel_count;
+	promise(texel_count > 0);
+
+	// Process SIMD-width chunks; over-fetch is safe as the array tails are
+	// zero initialized. Pick the infill routine once up front based on the
+	// decimation density.
+	if (di.max_texel_weight_count > 2)
+	{
+		// General case: up to four decimated weights feed each texel
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat interp = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
+
+			vfloat ideal = loada(eai.weights + i);
+			vfloat delta = interp - ideal;
+			vfloat err_scale = loada(eai.weight_error_scale + i);
+
+			haccumulate(err_sum, delta * delta * err_scale);
+		}
+	}
+	else if (di.max_texel_weight_count > 1)
+	{
+		// Single-axis decimation: two decimated weights feed each texel
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat interp = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
+
+			vfloat ideal = loada(eai.weights + i);
+			vfloat delta = interp - ideal;
+			vfloat err_scale = loada(eai.weight_error_scale + i);
+
+			haccumulate(err_sum, delta * delta * err_scale);
+		}
+	}
+	else
+	{
+		// Direct 1:1 mapping: no interpolation needed
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat interp = loada(dec_weight_quant_uvalue + i);
+
+			vfloat ideal = loada(eai.weights + i);
+			vfloat delta = interp - ideal;
+			vfloat err_scale = loada(eai.weight_error_scale + i);
+
+			haccumulate(err_sum, delta * delta * err_scale);
+		}
+	}
+
+	// Resolve the final scalar accumulator sum
+	return hadd_s(err_sum);
+}
+
+/* See header for documentation. */
+// Accumulates the weighted squared error of both plane weight sets against
+// their ideal values, sharing one pass over the texels for the two planes.
+float compute_error_of_weight_set_2planes(
+	const endpoints_and_weights& eai1,
+	const endpoints_and_weights& eai2,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue_plane1,
+	const float* dec_weight_quant_uvalue_plane2
+) {
+	vfloatacc error_summav = vfloatacc::zero();
+	unsigned int texel_count = di.texel_count;
+	promise(texel_count > 0);
+
+	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
+	if (di.max_texel_weight_count > 2)
+	{
+		// General case: up to four decimated weights feed each texel
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Plane 1
+			// Compute the bilinear interpolation of the decimated weight grid
+			vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values1 = loada(eai1.weights + i);
+			vfloat diff = current_values1 - actual_values1;
+			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
+
+			// Plane 2
+			// Compute the bilinear interpolation of the decimated weight grid
+			vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values2 = loada(eai2.weights + i);
+			diff = current_values2 - actual_values2;
+			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
+
+			haccumulate(error_summav, error1 + error2);
+		}
+	}
+	else if (di.max_texel_weight_count > 1)
+	{
+		// Single-axis decimation: two decimated weights feed each texel
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Plane 1
+			// Compute the bilinear interpolation of the decimated weight grid
+			vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values1 = loada(eai1.weights + i);
+			vfloat diff = current_values1 - actual_values1;
+			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
+
+			// Plane 2
+			// Compute the bilinear interpolation of the decimated weight grid
+			vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values2 = loada(eai2.weights + i);
+			diff = current_values2 - actual_values2;
+			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
+
+			haccumulate(error_summav, error1 + error2);
+		}
+	}
+	else
+	{
+		// Direct 1:1 mapping: no interpolation needed
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Plane 1
+			// Load the weight set directly, without interpolation
+			vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values1 = loada(eai1.weights + i);
+			vfloat diff = current_values1 - actual_values1;
+			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
+
+			// Plane 2
+			// Load the weight set directly, without interpolation
+			vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
+
+			// Compute the error between the computed value and the ideal weight
+			vfloat actual_values2 = loada(eai2.weights + i);
+			diff = current_values2 - actual_values2;
+			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
+
+			haccumulate(error_summav, error1 + error2);
+		}
+	}
+
+	// Resolve the final scalar accumulator sum
+	return hadd_s(error_summav);
+}
+
+/* See header for documentation. */
+// Computes the ideal decimated weight values: shortcut-copy for 1:1 grids,
+// otherwise an error-weighted initial estimate followed by one refinement
+// iteration against the bilinear-infilled texel weights.
+void compute_ideal_weights_for_decimation(
+	const endpoints_and_weights& ei,
+	const decimation_info& di,
+	float* dec_weight_ideal_value
+) {
+	unsigned int texel_count = di.texel_count;
+	unsigned int weight_count = di.weight_count;
+	bool is_direct = texel_count == weight_count;
+	promise(texel_count > 0);
+	promise(weight_count > 0);
+
+	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
+	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
+	// arrays always contain space for 64 elements
+	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
+	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
+
+	// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
+	// zero-initialized SIMD over-fetch region
+	if (is_direct)
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight(ei.weights + i);
+			storea(weight, dec_weight_ideal_value + i);
+		}
+
+		return;
+	}
+
+	// Otherwise compute an estimate and perform single refinement iteration
+	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
+
+	// Compute an initial average for each decimated weight
+	bool constant_wes = ei.is_constant_weight_error_scale;
+	vfloat weight_error_scale(ei.weight_error_scale[0]);
+
+	// This overshoots - this is OK as we initialize the array tails in the
+	// decimation table structures to safe values ...
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Start with a small value to avoid div-by-zero later
+		vfloat weight_weight(1e-10f);
+		vfloat initial_weight = vfloat::zero();
+
+		// Accumulate error weighting of all the texels using this weight
+		vint weight_texel_count(di.weight_texel_count + i);
+		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		promise(max_texel_count > 0);
+
+		for (unsigned int j = 0; j < max_texel_count; j++)
+		{
+			vint texel(di.weight_texels_tr[j] + i);
+			vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
+
+			if (!constant_wes)
+			{
+				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+			}
+
+			vfloat contrib_weight = weight * weight_error_scale;
+
+			weight_weight += contrib_weight;
+			initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+		}
+
+		storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
+	}
+
+	// Populate the interpolated weight grid based on the initial average
+	// Process SIMD-width texel coordinates at a time while we can. Safe to
+	// over-process full SIMD vectors - the tail is zeroed.
+	if (di.max_texel_weight_count <= 2)
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
+			storea(weight, infilled_weights + i);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
+			storea(weight, infilled_weights + i);
+		}
+	}
+
+	// Perform a single iteration of refinement
+	// Empirically determined step size; larger values don't help but smaller drops image quality
+	constexpr float stepsize = 0.25f;
+	constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
+
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat weight_val = loada(dec_weight_ideal_value + i);
+
+		// Accumulate error weighting of all the texels using this weight
+		// Start with a small value to avoid div-by-zero later
+		vfloat error_change0(1e-10f);
+		vfloat error_change1(0.0f);
+
+		vint weight_texel_count(di.weight_texel_count + i);
+		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		promise(max_texel_count > 0);
+
+		for (unsigned int j = 0; j < max_texel_count; j++)
+		{
+			vint texel(di.weight_texels_tr[j] + i);
+			vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
+
+			if (!constant_wes)
+			{
+				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+			}
+
+			vfloat scale = weight_error_scale * contrib_weight;
+			vfloat old_weight = gatherf(infilled_weights, texel);
+			vfloat ideal_weight = gatherf(ei.weights, texel);
+
+			error_change0 += contrib_weight * scale;
+			error_change1 += (old_weight - ideal_weight) * scale;
+		}
+
+		// Step each weight against the error gradient, clamped to the step size
+		vfloat step = (error_change1 * chd_scale) / error_change0;
+		step = clamp(-stepsize, stepsize, step);
+
+		// Update the weight; note this can store negative values
+		storea(weight_val + step, dec_weight_ideal_value + i);
+	}
+}
+
+/* See header for documentation. */
+void compute_quantized_weights_for_decimation(
+	const decimation_info& di,
+	float low_bound,
+	float high_bound,
+	const float* dec_weight_ideal_value,
+	float* weight_set_out,
+	uint8_t* quantized_weight_set,
+	quant_method quant_level
+) {
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+	const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
+
+	// The available quant levels, stored with a minus 1 bias
+	static const float quant_levels_m1[12] {
+		1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
+	};
+
+	vint steps_m1(get_quant_level(quant_level) - 1);
+	float quant_level_m1 = quant_levels_m1[quant_level];
+
+	// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
+
+	// TODO: Oddity to investigate; triggered by test in issue #265.
+	if (high_bound <= low_bound)
+	{
+		low_bound = 0.0f;
+		high_bound = 1.0f;
+	}
+
+	float rscale = high_bound - low_bound;
+	float scale = 1.0f / rscale;
+
+	float scaled_low_bound = low_bound * scale;
+	// rscale maps unquantized table entries (0..64 range) back into the low..high range
+	rscale *= 1.0f / 64.0f;
+
+	vfloat scalev(scale);
+	vfloat scaled_low_boundv(scaled_low_bound);
+	vfloat quant_level_m1v(quant_level_m1);
+	vfloat rscalev(rscale);
+	vfloat low_boundv(low_bound);
+
+	// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
+	// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
+	// Levels with 16 or fewer steps fit a single 16-byte vtable; larger levels need two tables
+	if (get_quant_level(quant_level) <= 16)
+	{
+		vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
+		vint tab0p;
+		vtable_prepare(tab0, tab0p);
+
+		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Remap the ideal weight into 0..1 relative to the low/high bounds
+			vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
+			ix = clampzo(ix);
+
+			// Look up the two closest indexes and return the one that was closest
+			vfloat ix1 = ix * quant_level_m1v;
+
+			vint weightl = float_to_int(ix1);
+			vint weighth = min(weightl + vint(1), steps_m1);
+
+			vint ixli = vtable_8bt_32bi(tab0p, weightl);
+			vint ixhi = vtable_8bt_32bi(tab0p, weighth);
+
+			vfloat ixl = int_to_float(ixli);
+			vfloat ixh = int_to_float(ixhi);
+
+			// (ixl + ixh) < 128 * ix is the midpoint test in the 0..64 unquantized space:
+			// take the higher candidate when the target lies above the midpoint
+			vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
+			vint weight = select(ixli, ixhi, mask);
+			ixl = select(ixl, ixh, mask);
+
+			// Invert the weight-scaling that was done initially
+			storea(ixl * rscalev + low_boundv, weight_set_out + i);
+			vint scn = pack_low_bytes(weight);
+			store_nbytes(scn, quantized_weight_set + i);
+		}
+	}
+	else
+	{
+		vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
+		vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16));
+		vint tab0p, tab1p;
+		vtable_prepare(tab0, tab1, tab0p, tab1p);
+
+		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			// Remap the ideal weight into 0..1 relative to the low/high bounds
+			vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
+			ix = clampzo(ix);
+
+			// Look up the two closest indexes and return the one that was closest
+			vfloat ix1 = ix * quant_level_m1v;
+
+			vint weightl = float_to_int(ix1);
+			vint weighth = min(weightl + vint(1), steps_m1);
+
+			vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
+			vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
+
+			vfloat ixl = int_to_float(ixli);
+			vfloat ixh = int_to_float(ixhi);
+
+			// Midpoint test in the 0..64 unquantized space; see comment in the branch above
+			vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
+			vint weight = select(ixli, ixhi, mask);
+			ixl = select(ixl, ixh, mask);
+
+			// Invert the weight-scaling that was done initially
+			storea(ixl * rscalev + low_boundv, weight_set_out + i);
+			vint scn = pack_low_bytes(weight);
+			store_nbytes(scn, quantized_weight_set + i);
+		}
+	}
+}
+
+/**
+ * @brief Compute the RGB + offset for a HDR endpoint mode #7.
+ *
+ * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
+ * gives us ~24 multiplications vs. 96 for a generic inverse.
+ *
+ *  mat[0] = vfloat4(rgba_ws.x,      0.0f,      0.0f, wght_ws.x);
+ *  mat[1] = vfloat4(     0.0f, rgba_ws.y,      0.0f, wght_ws.y);
+ *  mat[2] = vfloat4(     0.0f,      0.0f, rgba_ws.z, wght_ws.z);
+ *  mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z,      psum);
+ *  mat = invert(mat);
+ *
+ * @param rgba_weight_sum     Sum of partition component error weights.
+ * @param weight_weight_sum   Sum of partition component error weights * texel weight.
+ * @param rgbq_sum            Sum of partition component error weights * texel weight * color data.
+ * @param psum                Sum of RGB color weights * texel weight^2.
+ *
+ * @return The fitted RGB + offset vector; may contain NaN/Inf lanes if the matrix
+ *         was singular (determinant zero), which callers detect and handle.
+ */
+static inline vfloat4 compute_rgbo_vector(
+	vfloat4 rgba_weight_sum,
+	vfloat4 weight_weight_sum,
+	vfloat4 rgbq_sum,
+	float psum
+) {
+	float X = rgba_weight_sum.lane<0>();
+	float Y = rgba_weight_sum.lane<1>();
+	float Z = rgba_weight_sum.lane<2>();
+	float P = weight_weight_sum.lane<0>();
+	float Q = weight_weight_sum.lane<1>();
+	float R = weight_weight_sum.lane<2>();
+	float S = psum;
+
+	float PP = P * P;
+	float QQ = Q * Q;
+	float RR = R * R;
+
+	// Shared cofactor subexpressions of the sparse symmetric matrix above
+	float SZmRR = S * Z - RR;
+	float DT = SZmRR * Y - Z * QQ;
+	float YP = Y * P;
+	float QX = Q * X;
+	float YX = Y * X;
+	float mZYP = -Z * YP;
+	float mZQX = -Z * QX;
+	float mRYX = -R * YX;
+	float ZQP = Z * Q * P;
+	float RYP = R * YP;
+	float RQX = R * QX;
+
+	// Compute the reciprocal of matrix determinant
+	float rdet = 1.0f / (DT * X + mZYP * P);
+
+	// Actually compute the adjugate, and then apply 1/det separately
+	vfloat4 mat0(DT, ZQP, RYP, mZYP);
+	vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
+	vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
+	vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
+	vfloat4 vect = rgbq_sum * rdet;
+
+	// Result is adjugate * (rhs / det) == inverse(mat) * rhs
+	return vfloat4(dot_s(mat0, vect),
+	               dot_s(mat1, vect),
+	               dot_s(mat2, vect),
+	               dot_s(mat3, vect));
+}
+
+/* See header for documentation. */
+void recompute_ideal_colors_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant,
+	endpoints& ep,
+	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
+	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int weight_count = di.weight_count;
+	unsigned int total_texel_count = blk.texel_count;
+	unsigned int partition_count = pi.partition_count;
+
+	promise(weight_count > 0);
+	promise(total_texel_count > 0);
+	promise(partition_count > 0);
+
+	// Unquantize the decimated weight grid from 0..64 integers into 0..1 floats
+	alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vint unquant_value(dec_weights_uquant + i);
+		vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
+		storea(unquant_valuef, dec_weight + i);
+	}
+
+	// Expand the decimated grid into one weight per texel, using the cheapest
+	// infill routine the decimation pattern allows (1:1 alias, 2-tap, or full)
+	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
+	float* undec_weight_ref;
+	if (di.max_texel_weight_count == 1)
+	{
+		undec_weight_ref = dec_weight;
+	}
+	else if (di.max_texel_weight_count <= 2)
+	{
+		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
+			storea(weight, undec_weight + i);
+		}
+
+		undec_weight_ref = undec_weight;
+	}
+	else
+	{
+		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla(di, dec_weight, i);
+			storea(weight, undec_weight + i);
+		}
+
+		undec_weight_ref = undec_weight;
+	}
+
+	vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
+
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		unsigned int texel_count = pi.partition_texel_count[i];
+		const uint8_t *texel_indexes = pi.texels_of_partition[i];
+
+		// Only compute a partition mean if more than one partition
+		if (partition_count > 1)
+		{
+			rgba_sum = vfloat4::zero();
+			promise(texel_count > 0);
+			for (unsigned int j = 0; j < texel_count; j++)
+			{
+				unsigned int tix = texel_indexes[j];
+				rgba_sum += blk.texel(tix);
+			}
+		}
+
+		rgba_sum = rgba_sum * blk.channel_weight;
+		vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
+		vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
+
+		float scale_max = 0.0f;
+		float scale_min = 1e10f;
+
+		float wmin1 = 1.0f;
+		float wmax1 = 0.0f;
+
+		// Accumulators for the least-squares fit: sums of (1-w)^2, (1-w)*w, w^2
+		float left_sum_s = 0.0f;
+		float middle_sum_s = 0.0f;
+		float right_sum_s = 0.0f;
+
+		vfloat4 color_vec_x = vfloat4::zero();
+		vfloat4 color_vec_y = vfloat4::zero();
+
+		vfloat4 scale_vec = vfloat4::zero();
+
+		float weight_weight_sum_s = 1e-17f;
+
+		vfloat4 color_weight = blk.channel_weight;
+		float ls_weight = hadd_rgb_s(color_weight);
+
+		for (unsigned int j = 0; j < texel_count; j++)
+		{
+			unsigned int tix = texel_indexes[j];
+			vfloat4 rgba = blk.texel(tix);
+
+			float idx0 = undec_weight_ref[tix];
+
+			float om_idx0 = 1.0f - idx0;
+			wmin1 = astc::min(idx0, wmin1);
+			wmax1 = astc::max(idx0, wmax1);
+
+			// Projection onto the dominant color direction, for the RGBS scale fit
+			float scale = dot3_s(scale_dir, rgba);
+			scale_min = astc::min(scale, scale_min);
+			scale_max = astc::max(scale, scale_max);
+
+			left_sum_s   += om_idx0 * om_idx0;
+			middle_sum_s += om_idx0 * idx0;
+			right_sum_s  += idx0 * idx0;
+			weight_weight_sum_s += idx0;
+
+			vfloat4 color_idx(idx0);
+			vfloat4 cwprod = rgba;
+			vfloat4 cwiprod = cwprod * color_idx;
+
+			// color_vec_y accumulates color * w (pulls towards endpoint 1);
+			// color_vec_x accumulates color * (1 - w) (pulls towards endpoint 0)
+			color_vec_y += cwiprod;
+			color_vec_x += cwprod - cwiprod;
+
+			scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
+		}
+
+		vfloat4 left_sum   = vfloat4(left_sum_s) * color_weight;
+		vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
+		vfloat4 right_sum  = vfloat4(right_sum_s) * color_weight;
+		vfloat4 lmrs_sum   = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
+
+		color_vec_x = color_vec_x * color_weight;
+		color_vec_y = color_vec_y * color_weight;
+
+		// Initialize the luminance and scale vectors with a reasonable default
+		float scalediv = scale_min / astc::max(scale_max, 1e-10f);
+		scalediv = astc::clamp1f(scalediv);
+
+		vfloat4 sds = scale_dir * scale_max;
+
+		rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
+
+		if (wmin1 >= wmax1 * 0.999f)
+		{
+			// If all weights in the partition were equal, then just take average of all colors in
+			// the partition and use that as both endpoint colors
+			vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
+
+			// NaN != NaN, so self-comparison masks out lanes where the average is NaN
+			vmask4 notnan_mask = avg == avg;
+			ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
+			ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
+
+			rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
+		}
+		else
+		{
+			// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
+			// set of texel weights and pixel colors
+			vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
+			vfloat4 color_rdet1 = 1.0f / color_det1;
+
+			float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
+			float ls_rdet1 = 1.0f / ls_det1;
+
+			vfloat4 color_mss1 = (left_sum * left_sum)
+			                   + (2.0f * middle_sum * middle_sum)
+			                   + (right_sum * right_sum);
+
+			float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
+			              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
+			              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
+
+			// 2x2 linear solve per channel for the two endpoint colors
+			vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
+			vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
+
+			// Only accept the solve where the determinant is well-conditioned and finite
+			vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
+			vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
+			vmask4 full_mask = det_mask & notnan_mask;
+
+			ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
+			ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
+
+			float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
+			float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
+
+			if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
+			{
+				float scalediv2 = scale_ep0 / scale_ep1;
+				vfloat4 sdsm = scale_dir * scale_ep1;
+				rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
+			}
+		}
+
+		// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
+		if (blk.rgb_lns[0] || blk.alpha_lns[0])
+		{
+			vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
+			float psum = right_sum_s * hadd_rgb_s(color_weight);
+
+			vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+			rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+
+			vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+			rgbo_vectors[i] = rgbovec;
+
+			// We can get a failure due to the use of a singular (non-invertible) matrix
+			// If it failed, compute rgbo_vectors[] with a different method ...
+			if (astc::isnan(dot_s(rgbovec, rgbovec)))
+			{
+				vfloat4 v0 = ep.endpt0[i];
+				vfloat4 v1 = ep.endpt1[i];
+
+				// Fallback: derive offset from the mean RGB difference of the endpoints
+				float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
+				avgdif = astc::max(avgdif, 0.0f);
+
+				vfloat4 avg = (v0 + v1) * 0.5f;
+				vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
+				rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+void recompute_ideal_colors_2planes(
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant_plane1,
+	const uint8_t* dec_weights_uquant_plane2,
+	endpoints& ep,
+	vfloat4& rgbs_vector,
+	vfloat4& rgbo_vector,
+	int plane2_component
+) {
+	unsigned int weight_count = di.weight_count;
+	unsigned int total_texel_count = blk.texel_count;
+
+	promise(total_texel_count > 0);
+	promise(weight_count > 0);
+
+	alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
+	alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
+
+	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
+
+	// Unquantize both decimated weight grids from 0..64 integers into 0..1 floats
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vint unquant_value1(dec_weights_uquant_plane1 + i);
+		vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
+		storea(unquant_value1f, dec_weight_plane1 + i);
+
+		vint unquant_value2(dec_weights_uquant_plane2 + i);
+		vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
+		storea(unquant_value2f, dec_weight_plane2 + i);
+	}
+
+	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
+	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
+
+	float* undec_weight_plane1_ref;
+	float* undec_weight_plane2_ref;
+
+	// Expand the decimated grids into one weight per texel, using the cheapest
+	// infill routine the decimation pattern allows (1:1 alias, 2-tap, or full)
+	if (di.max_texel_weight_count == 1)
+	{
+		undec_weight_plane1_ref = dec_weight_plane1;
+		undec_weight_plane2_ref = dec_weight_plane2;
+	}
+	else if (di.max_texel_weight_count <= 2)
+	{
+		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
+			storea(weight, undec_weight_plane1 + i);
+
+			weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
+			storea(weight, undec_weight_plane2 + i);
+		}
+
+		undec_weight_plane1_ref = undec_weight_plane1;
+		undec_weight_plane2_ref = undec_weight_plane2;
+	}
+	else
+	{
+		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
+			storea(weight, undec_weight_plane1 + i);
+
+			weight = bilinear_infill_vla(di, dec_weight_plane2, i);
+			storea(weight, undec_weight_plane2 + i);
+		}
+
+		undec_weight_plane1_ref = undec_weight_plane1;
+		undec_weight_plane2_ref = undec_weight_plane2;
+	}
+
+	unsigned int texel_count = bsd.texel_count;
+	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
+	vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
+
+	float scale_max = 0.0f;
+	float scale_min = 1e10f;
+
+	float wmin1 = 1.0f;
+	float wmax1 = 0.0f;
+
+	float wmin2 = 1.0f;
+	float wmax2 = 0.0f;
+
+	// Least-squares accumulators per plane: sums of (1-w)^2, (1-w)*w, w^2
+	float left1_sum_s = 0.0f;
+	float middle1_sum_s = 0.0f;
+	float right1_sum_s = 0.0f;
+
+	float left2_sum_s = 0.0f;
+	float middle2_sum_s = 0.0f;
+	float right2_sum_s = 0.0f;
+
+	vfloat4 color_vec_x = vfloat4::zero();
+	vfloat4 color_vec_y = vfloat4::zero();
+
+	vfloat4 scale_vec = vfloat4::zero();
+
+	vfloat4 weight_weight_sum = vfloat4(1e-17f);
+
+	// Lane mask selecting the single component carried by weight plane 2
+	vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
+	vfloat4 color_weight = blk.channel_weight;
+	float ls_weight = hadd_rgb_s(color_weight);
+
+	for (unsigned int j = 0; j < texel_count; j++)
+	{
+		vfloat4 rgba = blk.texel(j);
+
+		float idx0 = undec_weight_plane1_ref[j];
+
+		float om_idx0 = 1.0f - idx0;
+		wmin1 = astc::min(idx0, wmin1);
+		wmax1 = astc::max(idx0, wmax1);
+
+		// Projection onto the dominant color direction, for the RGBS scale fit
+		float scale = dot3_s(scale_dir, rgba);
+		scale_min = astc::min(scale, scale_min);
+		scale_max = astc::max(scale, scale_max);
+
+		left1_sum_s   += om_idx0 * om_idx0;
+		middle1_sum_s += om_idx0 * idx0;
+		right1_sum_s  += idx0 * idx0;
+
+		float idx1 = undec_weight_plane2_ref[j];
+
+		float om_idx1 = 1.0f - idx1;
+		wmin2 = astc::min(idx1, wmin2);
+		wmax2 = astc::max(idx1, wmax2);
+
+		left2_sum_s   += om_idx1 * om_idx1;
+		middle2_sum_s += om_idx1 * idx1;
+		right2_sum_s  += idx1 * idx1;
+
+		// Per-lane weight: plane 2's weight for its component, plane 1's elsewhere
+		vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
+
+		vfloat4 cwprod = rgba;
+		vfloat4 cwiprod = cwprod * color_idx;
+
+		// color_vec_y accumulates color * w; color_vec_x accumulates color * (1 - w)
+		color_vec_y += cwiprod;
+		color_vec_x += cwprod - cwiprod;
+
+		scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
+		weight_weight_sum += color_idx;
+	}
+
+	vfloat4 left1_sum   = vfloat4(left1_sum_s) * color_weight;
+	vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
+	vfloat4 right1_sum  = vfloat4(right1_sum_s) * color_weight;
+	vfloat4 lmrs_sum    = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
+
+	vfloat4 left2_sum   = vfloat4(left2_sum_s) * color_weight;
+	vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
+	vfloat4 right2_sum  = vfloat4(right2_sum_s) * color_weight;
+
+	color_vec_x = color_vec_x * color_weight;
+	color_vec_y = color_vec_y * color_weight;
+
+	// Initialize the luminance and scale vectors with a reasonable default
+	float scalediv = scale_min / astc::max(scale_max, 1e-10f);
+	scalediv = astc::clamp1f(scalediv);
+
+	vfloat4 sds = scale_dir * scale_max;
+
+	rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
+
+	if (wmin1 >= wmax1 * 0.999f)
+	{
+		// If all weights in the partition were equal, then just take average of all colors in
+		// the partition and use that as both endpoint colors
+		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
+
+		// Only the plane 1 lanes are updated here; NaN lanes are also left unchanged
+		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
+		vmask4 notnan_mask = avg == avg;
+		vmask4 full_mask = p1_mask & notnan_mask;
+
+		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
+		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
+
+		rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
+	}
+	else
+	{
+		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
+		// set of texel weights and pixel colors
+		vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
+		vfloat4 color_rdet1 = 1.0f / color_det1;
+
+		float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
+		float ls_rdet1 = 1.0f / ls_det1;
+
+		vfloat4 color_mss1 = (left1_sum * left1_sum)
+		                   + (2.0f * middle1_sum * middle1_sum)
+		                   + (right1_sum * right1_sum);
+
+		float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
+		              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
+		              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
+
+		// 2x2 linear solve per channel for the two endpoint colors
+		vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
+		vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
+
+		float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
+		float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
+
+		// Accept the solve only for plane 1 lanes with a well-conditioned, finite result
+		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
+		vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
+		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
+		vmask4 full_mask = p1_mask & det_mask & notnan_mask;
+
+		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
+		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
+
+		if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
+		{
+			float scalediv2 = scale_ep0 / scale_ep1;
+			vfloat4 sdsm = scale_dir * scale_ep1;
+			rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
+		}
+	}
+
+	// Repeat the endpoint fit for the single component carried by weight plane 2
+	if (wmin2 >= wmax2 * 0.999f)
+	{
+		// If all weights in the partition were equal, then just take average of all colors in
+		// the partition and use that as both endpoint colors
+		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
+
+		vmask4 notnan_mask = avg == avg;
+		vmask4 full_mask = p2_mask & notnan_mask;
+
+		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
+		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
+	}
+	else
+	{
+		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
+		// set of texel weights and pixel colors
+		vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
+		vfloat4 color_rdet2 = 1.0f / color_det2;
+
+		vfloat4 color_mss2 = (left2_sum * left2_sum)
+		                   + (2.0f * middle2_sum * middle2_sum)
+		                   + (right2_sum * right2_sum);
+
+		vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
+		vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
+
+		vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
+		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
+		vmask4 full_mask = p2_mask & det_mask & notnan_mask;
+
+		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
+		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
+	}
+
+	// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
+	if (blk.rgb_lns[0] || blk.alpha_lns[0])
+	{
+		weight_weight_sum = weight_weight_sum * color_weight;
+		float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
+
+		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+
+		rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+
+		// We can get a failure due to the use of a singular (non-invertible) matrix
+		// If it failed, compute rgbo_vectors[] with a different method ...
+		if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
+		{
+			vfloat4 v0 = ep.endpt0[0];
+			vfloat4 v1 = ep.endpt1[0];
+
+			// Fallback: derive offset from the mean RGB difference of the endpoints
+			float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
+			avgdif = astc::max(avgdif, 0.0f);
+
+			vfloat4 avg = (v0 + v1) * 0.5f;
+			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
+
+			rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+		}
+	}
+}
+
+#endif

+ 558 - 0
thirdparty/astcenc/astcenc_image.cpp

@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for creating in-memory ASTC image structures.
+ */
+
+#include <cassert>
+#include <cstring>
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Loader pipeline function type for data fetch from memory.
+ */
+using pixel_loader = vfloat4(*)(const void*, int);
+
+/**
+ * @brief Loader pipeline function type for swizzling data in a vector.
+ */
+using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
+
+/**
+ * @brief Loader pipeline function type for converting data in a vector to LNS.
+ */
+using pixel_converter = vfloat4(*)(vfloat4, vmask4);
+
+/**
+ * @brief Load a 8-bit UNORM texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel as an RGBA vector normalized to the 0..1 range.
+ */
+static vfloat4 load_texel_u8(
+	const void* data,
+	int base_offset
+) {
+	const uint8_t* data8 = static_cast<const uint8_t*>(data);
+	// vint4 built from a byte pointer expands four consecutive bytes (RGBA)
+	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
+}
+
+/**
+ * @brief Load a 16-bit fp16 texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel as an RGBA vector of converted fp32 values.
+ */
+static vfloat4 load_texel_f16(
+	const void* data,
+	int base_offset
+) {
+	const uint16_t* data16 = static_cast<const uint16_t*>(data);
+	// Gather the four fp16 bit patterns, then convert to fp32 in one vector op
+	int r = data16[base_offset    ];
+	int g = data16[base_offset + 1];
+	int b = data16[base_offset + 2];
+	int a = data16[base_offset + 3];
+	return float16_to_float(vint4(r, g, b, a));
+}
+
+/**
+ * @brief Load a 32-bit float texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel as an RGBA vector, loaded directly with no conversion.
+ */
+static vfloat4 load_texel_f32(
+	const void* data,
+	int base_offset
+) {
+	const float* data32 = static_cast<const float*>(data);
+	return vfloat4(data32 + base_offset);
+}
+
+/**
+ * @brief Dummy no-op swizzle function.
+ *
+ * Used in the loader pipeline when the requested swizzle is the identity,
+ * avoiding the per-texel cost of the general swizzle path.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use.
+ *
+ * @return The input @c data, unchanged.
+ */
+static vfloat4 swz_texel_skip(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	(void)swz;
+	return data;
+}
+
+/**
+ * @brief Swizzle a texel into a new arrangement.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use.
+ *
+ * @return The swizzled RGBA vector.
+ */
+static vfloat4 swz_texel(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	// Slots 0-3 hold the source RGBA; the two extra slots hold literal 0.0 and
+	// 1.0 so the ASTCENC_SWZ_0 / ASTCENC_SWZ_1 selectors can index them directly
+	alignas(16) float datas[6];
+
+	storea(data, datas);
+	datas[ASTCENC_SWZ_0] = 0.0f;
+	datas[ASTCENC_SWZ_1] = 1.0f;
+
+	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
+}
+
+/**
+ * @brief Encode a texel that is entirely LDR linear.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels than need LNS encoding.
+ *
+ * @return The texel scaled to the internal 0..65535 UNORM16 working range.
+ */
+static vfloat4 encode_texel_unorm(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	(void)lns_mask;
+	return data * 65535.0f;
+}
+
+/**
+ * @brief Encode a texel that includes at least some HDR LNS texels.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels than need LNS encoding.
+ *
+ * @return The texel with masked lanes LNS-encoded and the rest scaled to UNORM16.
+ */
+static vfloat4 encode_texel_lns(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	// Compute both encodings, then blend per lane using the LNS mask
+	vfloat4 datav_unorm = data * 65535.0f;
+	vfloat4 datav_lns = float_to_lns(data);
+	return select(datav_unorm, datav_lns, lns_mask);
+}
+
+/* See header for documentation. */
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+	unsigned int zsize = img.dim_z;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	int idx = 0;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean(0.0f);
+	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+
+	// This works because we impose the same choice everywhere during encode
+	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
+	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
+	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
+	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
+	vmask4 lns_mask = use_lns != vint4::zero();
+
+	// Set up the function pointers for loading pipeline as needed
+	pixel_loader loader = load_texel_u8;
+	if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		loader = load_texel_f16;
+	}
+	else if  (img.data_type == ASTCENC_TYPE_F32)
+	{
+		loader = load_texel_f32;
+	}
+
+	pixel_swizzler swizzler = swz_texel_skip;
+	if (needs_swz)
+	{
+		swizzler = swz_texel;
+	}
+
+	pixel_converter converter = encode_texel_unorm;
+	if (any(lns_mask))
+	{
+		converter = encode_texel_lns;
+	}
+
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		// Block footprints that overhang the image edge clamp to the last texel
+		unsigned int zi = astc::min(zpos + z, zsize - 1);
+		void* plane = img.data[zi];
+
+		for (unsigned int y = 0; y < bsd.ydim; y++)
+		{
+			unsigned int yi = astc::min(ypos + y, ysize - 1);
+
+			for (unsigned int x = 0; x < bsd.xdim; x++)
+			{
+				unsigned int xi = astc::min(xpos + x, xsize - 1);
+
+				// Offset is in channel units; 4 channels per pixel
+				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
+				datav = swizzler(datav, swz);
+				datav = converter(datav, lns_mask);
+
+				// Compute block metadata
+				data_min = min(data_min, datav);
+				data_mean += datav * data_mean_scale;
+				data_max = max(data_max, datav);
+
+				// Grayscale requires r == g and r == b for every texel
+				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
+
+				blk.data_r[idx] = datav.lane<0>();
+				blk.data_g[idx] = datav.lane<1>();
+				blk.data_b[idx] = datav.lane<2>();
+				blk.data_a[idx] = datav.lane<3>();
+
+				blk.rgb_lns[idx] = rgb_lns;
+				blk.alpha_lns[idx] = a_lns;
+
+				idx++;
+			}
+		}
+	}
+
+	// Reverse the encoding so we store origin block in the original format
+	vfloat4 data_enc = blk.texel(0);
+	vfloat4 data_enc_unorm = data_enc / 65535.0f;
+	vfloat4 data_enc_lns = vfloat4::zero();
+
+	if (rgb_lns || a_lns)
+	{
+		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
+	}
+
+	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
+
+	// Store block metadata
+	blk.data_min = data_min;
+	blk.data_mean = data_mean;
+	blk.data_max = data_max;
+	blk.grayscale = all(grayscalev);
+}
+
+/* See header for documentation. */
+void load_image_block_fast_ldr(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	// Fast path: swizzle and decode mode are ignored; only U8 LDR 2D data
+	// (slice 0, identity swizzle) is handled here
+	(void)swz;
+	(void)decode_mode;
+
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean = vfloat4::zero();
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+	int idx = 0;
+
+	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
+	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
+	{
+		// Block footprints that overhang the image edge clamp to the last texel
+		unsigned int yi = astc::min(y, ysize - 1);
+
+		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
+		{
+			unsigned int xi = astc::min(x, xsize - 1);
+
+			// Load four RGBA bytes and rescale UNORM8 into the UNORM16 working range
+			vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
+			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
+
+			// Compute block metadata
+			data_min = min(data_min, datav);
+			data_mean += datav;
+			data_max = max(data_max, datav);
+
+			// Grayscale requires r == g and r == b for every texel
+			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
+
+			blk.data_r[idx] = datav.lane<0>();
+			blk.data_g[idx] = datav.lane<1>();
+			blk.data_b[idx] = datav.lane<2>();
+			blk.data_a[idx] = datav.lane<3>();
+
+			idx++;
+		}
+	}
+
+	// Reverse the encoding so we store origin block in the original format
+	blk.origin_texel = blk.texel(0) / 65535.0f;
+
+	// Store block metadata
+	blk.rgb_lns[0] = 0;
+	blk.alpha_lns[0] = 0;
+	blk.data_min = data_min;
+	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
+	blk.data_max = data_max;
+	blk.grayscale = all(grayscalev);
+}
+
+/* See header for documentation. */
+// Write one decoded block back into the output image, clipped against the
+// image bounds. Block texels that fall outside the image are skipped via the
+// x/y "nudge" increments on the source index. Three output paths exist for
+// U8, F16 and F32 images; only the U8 path is vectorized.
+void store_image_block(
+	astcenc_image& img,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int x_size = img.dim_x;
+	unsigned int x_start = xpos;
+	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
+	unsigned int x_count = x_end - x_start;
+	// Number of block texels per row that overhang the image and are skipped
+	unsigned int x_nudge = bsd.xdim - x_count;
+
+	unsigned int y_size = img.dim_y;
+	unsigned int y_start = ypos;
+	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
+	unsigned int y_count = y_end - y_start;
+	// Number of block texels per plane that overhang the image and are skipped
+	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
+
+	unsigned int z_size = img.dim_z;
+	unsigned int z_start = zpos;
+	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	// True if any swizzle uses Z reconstruct
+	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
+	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
+
+	int idx = 0;
+	if (img.data_type == ASTCENC_TYPE_U8)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
+				{
+					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
+					unsigned int used_texels = astc::min(x_count - x, max_texels);
+
+					// Unaligned load as rows are not always SIMD_WIDTH long
+					vfloat data_r(blk.data_r + idx);
+					vfloat data_g(blk.data_g + idx);
+					vfloat data_b(blk.data_b + idx);
+					vfloat data_a(blk.data_a + idx);
+
+					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
+					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
+					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
+					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
+
+					if (needs_swz)
+					{
+						vint swizzle_table[7];
+						swizzle_table[ASTCENC_SWZ_0] = vint(0);
+						swizzle_table[ASTCENC_SWZ_1] = vint(255);
+						swizzle_table[ASTCENC_SWZ_R] = data_ri;
+						swizzle_table[ASTCENC_SWZ_G] = data_gi;
+						swizzle_table[ASTCENC_SWZ_B] = data_bi;
+						swizzle_table[ASTCENC_SWZ_A] = data_ai;
+
+						if (needs_z)
+						{
+							// Reconstruct Z from the X (r) and Y (a) components
+							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
+							data_z = max(data_z, 0.0f);
+							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
+
+							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
+						}
+
+						data_ri = swizzle_table[swz.r];
+						data_gi = swizzle_table[swz.g];
+						data_bi = swizzle_table[swz.b];
+						data_ai = swizzle_table[swz.a];
+					}
+
+					// Errors are NaN encoded - convert to magenta error color
+					// Branch is OK here - it is almost never true so predicts well
+					vmask nan_mask = data_r != data_r;
+					if (any(nan_mask))
+					{
+						data_ri = select(data_ri, vint(0xFF), nan_mask);
+						data_gi = select(data_gi, vint(0x00), nan_mask);
+						data_bi = select(data_bi, vint(0xFF), nan_mask);
+						data_ai = select(data_ai, vint(0xFF), nan_mask);
+					}
+
+					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
+					vmask store_mask = vint::lane_id() < vint(used_texels);
+					store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
+
+					data8_row += ASTCENC_SIMD_WIDTH * 4;
+					idx += used_texels;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vint4 color;
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = blk.data_r[idx];
+						data[ASTCENC_SWZ_G] = blk.data_g[idx];
+						data[ASTCENC_SWZ_B] = blk.data_b[idx];
+						data[ASTCENC_SWZ_A] = blk.data_a[idx];
+
+						if (needs_z)
+						{
+							// Reconstruct Z from the X (r) and Y (a) components
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+						color = float_to_float16(colorf);
+					}
+					else
+					{
+						vfloat4 colorf = blk.texel(idx);
+						color = float_to_float16(colorf);
+					}
+
+					// TODO: Vectorize with store N shorts?
+					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
+					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
+					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
+					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
+					data16_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else // if (img.data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img.data_type == ASTCENC_TYPE_F32);
+
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			float* data32 = static_cast<float*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vfloat4 color = blk.texel(idx);
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = color.lane<0>();
+						data[ASTCENC_SWZ_G] = color.lane<1>();
+						data[ASTCENC_SWZ_B] = color.lane<2>();
+						data[ASTCENC_SWZ_A] = color.lane<3>();
+
+						if (needs_z)
+						{
+							// Reconstruct Z from the X (r) and Y (a) components
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					}
+
+					store(color, data32_row);
+					data32_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+}

+ 739 - 0
thirdparty/astcenc/astcenc_integer_sequence.cpp

@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
+ */
+
+#include "astcenc_internal.h"
+
+#include <array>
+
+/** @brief Unpacked quint triplets <low,middle,high>, indexed by the 7-bit packed value. */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t quints_of_integer[128][3] {
+	{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
+	{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
+	{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
+	{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
+	{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
+	{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
+	{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
+	{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
+	{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
+	{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
+	{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
+	{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
+	{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
+	{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
+	{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
+	{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
+	{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
+	{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
+	{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
+	{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
+	{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
+	{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
+	{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
+	{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
+	{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
+	{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
+	{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
+	{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
+	{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
+	{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
+	{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
+	{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
+};
+
+/** @brief Packed quint values for each unpacked value, indexed [hi][mid][lo]. */
+// Inverse mapping of quints_of_integer; used on the encode path.
+static const uint8_t integer_of_quints[5][5][5] {
+	{
+		{0, 1, 2, 3, 4},
+		{8, 9, 10, 11, 12},
+		{16, 17, 18, 19, 20},
+		{24, 25, 26, 27, 28},
+		{5, 13, 21, 29, 6}
+	},
+	{
+		{32, 33, 34, 35, 36},
+		{40, 41, 42, 43, 44},
+		{48, 49, 50, 51, 52},
+		{56, 57, 58, 59, 60},
+		{37, 45, 53, 61, 14}
+	},
+	{
+		{64, 65, 66, 67, 68},
+		{72, 73, 74, 75, 76},
+		{80, 81, 82, 83, 84},
+		{88, 89, 90, 91, 92},
+		{69, 77, 85, 93, 22}
+	},
+	{
+		{96, 97, 98, 99, 100},
+		{104, 105, 106, 107, 108},
+		{112, 113, 114, 115, 116},
+		{120, 121, 122, 123, 124},
+		{101, 109, 117, 125, 30}
+	},
+	{
+		{102, 103, 70, 71, 38},
+		{110, 111, 78, 79, 46},
+		{118, 119, 86, 87, 54},
+		{126, 127, 94, 95, 62},
+		{39, 47, 55, 63, 31}
+	}
+};
+
+/** @brief Unpacked trit quintuplets <low,...,high>, indexed by the 8-bit packed value */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t trits_of_integer[256][5] {
+	{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
+	{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
+	{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
+	{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
+	{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
+	{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
+	{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
+	{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
+	{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
+	{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
+	{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
+	{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
+	{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
+	{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
+	{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
+	{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
+	{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
+	{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
+	{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
+	{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
+	{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
+	{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
+	{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
+	{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
+	{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
+	{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
+	{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
+	{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
+	{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
+	{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
+	{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
+	{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
+	{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
+	{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
+	{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
+	{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
+	{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
+	{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
+	{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
+	{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
+	{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
+	{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
+	{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
+	{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
+};
+
+/** @brief Packed trit values for each unpacked value, indexed [hi][][][][lo]. */
+// Inverse mapping of trits_of_integer; used on the encode path.
+static const uint8_t integer_of_trits[3][3][3][3][3] {
+	{
+		{
+			{
+				{0, 1, 2},
+				{4, 5, 6},
+				{8, 9, 10}
+			},
+			{
+				{16, 17, 18},
+				{20, 21, 22},
+				{24, 25, 26}
+			},
+			{
+				{3, 7, 15},
+				{19, 23, 27},
+				{12, 13, 14}
+			}
+		},
+		{
+			{
+				{32, 33, 34},
+				{36, 37, 38},
+				{40, 41, 42}
+			},
+			{
+				{48, 49, 50},
+				{52, 53, 54},
+				{56, 57, 58}
+			},
+			{
+				{35, 39, 47},
+				{51, 55, 59},
+				{44, 45, 46}
+			}
+		},
+		{
+			{
+				{64, 65, 66},
+				{68, 69, 70},
+				{72, 73, 74}
+			},
+			{
+				{80, 81, 82},
+				{84, 85, 86},
+				{88, 89, 90}
+			},
+			{
+				{67, 71, 79},
+				{83, 87, 91},
+				{76, 77, 78}
+			}
+		}
+	},
+	{
+		{
+			{
+				{128, 129, 130},
+				{132, 133, 134},
+				{136, 137, 138}
+			},
+			{
+				{144, 145, 146},
+				{148, 149, 150},
+				{152, 153, 154}
+			},
+			{
+				{131, 135, 143},
+				{147, 151, 155},
+				{140, 141, 142}
+			}
+		},
+		{
+			{
+				{160, 161, 162},
+				{164, 165, 166},
+				{168, 169, 170}
+			},
+			{
+				{176, 177, 178},
+				{180, 181, 182},
+				{184, 185, 186}
+			},
+			{
+				{163, 167, 175},
+				{179, 183, 187},
+				{172, 173, 174}
+			}
+		},
+		{
+			{
+				{192, 193, 194},
+				{196, 197, 198},
+				{200, 201, 202}
+			},
+			{
+				{208, 209, 210},
+				{212, 213, 214},
+				{216, 217, 218}
+			},
+			{
+				{195, 199, 207},
+				{211, 215, 219},
+				{204, 205, 206}
+			}
+		}
+	},
+	{
+		{
+			{
+				{96, 97, 98},
+				{100, 101, 102},
+				{104, 105, 106}
+			},
+			{
+				{112, 113, 114},
+				{116, 117, 118},
+				{120, 121, 122}
+			},
+			{
+				{99, 103, 111},
+				{115, 119, 123},
+				{108, 109, 110}
+			}
+		},
+		{
+			{
+				{224, 225, 226},
+				{228, 229, 230},
+				{232, 233, 234}
+			},
+			{
+				{240, 241, 242},
+				{244, 245, 246},
+				{248, 249, 250}
+			},
+			{
+				{227, 231, 239},
+				{243, 247, 251},
+				{236, 237, 238}
+			}
+		},
+		{
+			{
+				{28, 29, 30},
+				{60, 61, 62},
+				{92, 93, 94}
+			},
+			{
+				{156, 157, 158},
+				{188, 189, 190},
+				{220, 221, 222}
+			},
+			{
+				{31, 63, 127},
+				{159, 191, 255},
+				{252, 253, 254}
+			}
+		}
+	}
+};
+
+/**
+ * @brief The number of bits, trits, and quints needed for a quant level.
+ */
+struct btq_count
+{
+	/** @brief The number of bits. */
+	uint8_t bits:6;
+
+	/** @brief The number of trits (0 or 1). */
+	uint8_t trits:1;
+
+	/** @brief The number of quints (0 or 1). */
+	uint8_t quints:1;
+};
+
+/**
+ * @brief The table of bits, trits, and quints needed for a quant encode.
+ *
+ * Indexed by the quant_method enumeration.
+ */
+static const std::array<btq_count, 21> btq_counts {{
+	{ 1, 0, 0 }, // QUANT_2
+	{ 0, 1, 0 }, // QUANT_3
+	{ 2, 0, 0 }, // QUANT_4
+	{ 0, 0, 1 }, // QUANT_5
+	{ 1, 1, 0 }, // QUANT_6
+	{ 3, 0, 0 }, // QUANT_8
+	{ 1, 0, 1 }, // QUANT_10
+	{ 2, 1, 0 }, // QUANT_12
+	{ 4, 0, 0 }, // QUANT_16
+	{ 2, 0, 1 }, // QUANT_20
+	{ 3, 1, 0 }, // QUANT_24
+	{ 5, 0, 0 }, // QUANT_32
+	{ 3, 0, 1 }, // QUANT_40
+	{ 4, 1, 0 }, // QUANT_48
+	{ 6, 0, 0 }, // QUANT_64
+	{ 4, 0, 1 }, // QUANT_80
+	{ 5, 1, 0 }, // QUANT_96
+	{ 7, 0, 0 }, // QUANT_128
+	{ 5, 0, 1 }, // QUANT_160
+	{ 6, 1, 0 }, // QUANT_192
+	{ 8, 0, 0 }  // QUANT_256
+}};
+
+/**
+ * @brief The sequence scale and encoded divisor needed to compute sizing.
+ *
+ * The length of a quantized sequence in bits is:
+ *     (scale * <sequence_len> + divisor - 1) / divisor
+ *
+ * ... where the stored @c divisor field is encoded; the real divisor is
+ * (2 * divisor + 1), i.e. 1, 3, or 5 (see get_ise_sequence_bitcount).
+ */
+struct ise_size
+{
+	/** @brief The scaling parameter. */
+	uint8_t scale:6;
+
+	/** @brief The encoded divisor parameter. */
+	uint8_t divisor:2;
+};
+
+/**
+ * @brief The table of scale and encoded divisor needed for quant sizing.
+ *
+ * Indexed by the quant_method enumeration.
+ */
+static const std::array<ise_size, 21> ise_sizes {{
+	{  1, 0 }, // QUANT_2
+	{  8, 2 }, // QUANT_3
+	{  2, 0 }, // QUANT_4
+	{  7, 1 }, // QUANT_5
+	{ 13, 2 }, // QUANT_6
+	{  3, 0 }, // QUANT_8
+	{ 10, 1 }, // QUANT_10
+	{ 18, 2 }, // QUANT_12
+	{  4, 0 }, // QUANT_16
+	{ 13, 1 }, // QUANT_20
+	{ 23, 2 }, // QUANT_24
+	{  5, 0 }, // QUANT_32
+	{ 16, 1 }, // QUANT_40
+	{ 28, 2 }, // QUANT_48
+	{  6, 0 }, // QUANT_64
+	{ 19, 1 }, // QUANT_80
+	{ 33, 2 }, // QUANT_96
+	{  7, 0 }, // QUANT_128
+	{ 22, 1 }, // QUANT_160
+	{ 38, 2 }, // QUANT_192
+	{  8, 0 }  // QUANT_256
+}};
+
+/* See header for documentation. */
+unsigned int get_ise_sequence_bitcount(
+	unsigned int character_count,
+	quant_method quant_level
+) {
+	// Cope with out-of bounds values - input might be invalid
+	if (static_cast<size_t>(quant_level) >= ise_sizes.size())
+	{
+		// Arbitrary large number that's more than an ASTC block can hold
+		return 1024;
+	}
+
+	auto& entry = ise_sizes[quant_level];
+	// Decode the stored 2-bit divisor field (0, 1, 2) to the real divisor (1, 3, 5)
+	unsigned int divisor = (entry.divisor << 1) + 1;
+	// Round up to a whole number of bits
+	return (entry.scale * character_count + divisor - 1) / divisor;
+}
+
+/**
+ * @brief Write up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but since it can start at any bit
+ * position within a byte it may span two separate bytes in memory. Note that
+ * the second byte is always touched, even when the value fits in the first
+ * one, so the buffer must extend at least one byte past the last written bit.
+ *
+ * @param         value       The value to write.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at; whole bytes of the
+ *                            offset are folded into the pointer, so any
+ *                            in-range offset is accepted.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	unsigned int value,
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	uint8_t ptr[2]
+) {
+	unsigned int mask = (1 << bitcount) - 1;
+	value &= mask;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	value <<= bitoffset;
+	mask <<= bitoffset;
+	mask = ~mask;
+
+	ptr[0] &= mask;
+	ptr[0] |= value;
+	ptr[1] &= mask >> 8;
+	ptr[1] |= value >> 8;
+}
+
+/**
+ * @brief Read up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but since it can start at any bit
+ * position within a byte it may span two separate bytes in memory. Note that
+ * the second byte is always read, even when the value fits in the first one,
+ * so the buffer must extend at least one byte past the last read bit.
+ *
+ * @param bitcount    The number of bits to read.
+ * @param bitoffset   The bit offset to read from; whole bytes of the offset
+ *                    are folded into the pointer, so any in-range offset is
+ *                    accepted.
+ * @param ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline unsigned int read_bits(
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	const uint8_t* ptr
+) {
+	unsigned int mask = (1 << bitcount) - 1;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	unsigned int value = ptr[0] | (ptr[1] << 8);
+	value >>= bitoffset;
+	value &= mask;
+	return value;
+}
+
+/* See header for documentation. */
+// Pack a sequence of quantized values using Bounded Integer Sequence
+// Encoding. Trit-encoded levels pack 5 values per 8-bit trit block, quint
+// levels pack 3 values per 7-bit quint block; plain power-of-two levels
+// write raw bits only.
+void encode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+	unsigned int mask = (1 << bits) - 1;
+
+	// Write out trits and bits
+	if (trits)
+	{
+		unsigned int i = 0;
+		unsigned int full_trit_blocks = character_count / 5;
+
+		for (unsigned int j = 0; j < full_trit_blocks; j++)
+		{
+			unsigned int i4 = input_data[i + 4] >> bits;
+			unsigned int i3 = input_data[i + 3] >> bits;
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			// The max size of a trit bit count is 6, so we can always safely
+			// pack a single MX value with the following 1 or 2 T bits.
+			uint8_t pack;
+
+			// Element 0 + T0 + T1
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 1 + T2 + T3
+			pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2 + T4
+			pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+
+			// Element 3 + T5 + T6
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 4 + T7
+			pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i4 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i4 =                            0;
+			unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
+			unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partial
+				static const uint8_t tbits[4]  { 2, 2, 1, 2 };
+				static const uint8_t tshift[4] { 0, 2, 4, 5 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out quints and bits
+	else if (quints)
+	{
+		unsigned int i = 0;
+		unsigned int full_quint_blocks = character_count / 3;
+
+		for (unsigned int j = 0; j < full_quint_blocks; j++)
+		{
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			// The max size of a quint bit count is 5, so we can always safely
+			// pack a single M value with the following 2 or 3 T bits.
+			uint8_t pack;
+
+			// Element 0
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
+			write_bits(pack, bits + 3, bit_offset, output_data);
+			bit_offset += bits + 3;
+
+			// Element 1
+			pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i2 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i2 =                            0;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partial
+				static const uint8_t tbits[2]  { 3, 2 };
+				static const uint8_t tshift[2] { 0, 3 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out just bits
+	else
+	{
+		for (unsigned int i = 0; i < character_count; i++)
+		{
+			write_bits(input_data[i], bits, bit_offset, output_data);
+			bit_offset += bits;
+		}
+	}
+}
+
+/* See header for documentation. */
+// Unpack a Bounded Integer Sequence Encoding stream: first gather the raw
+// bits per value and the interleaved trit/quint block bits, then decode the
+// blocks via the lookup tables and merge the high bits back in.
+void decode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
+	// temporary results than the number of outputs. The maximum actual number of results is 64,
+	// but we keep 4 additional results of padding.
+	uint8_t results[68];
+	uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+
+	unsigned int lcounter = 0;
+	unsigned int hcounter = 0;
+
+	// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
+		bit_offset += bits;
+
+		if (trits)
+		{
+			static const uint8_t bits_to_read[5]  { 2, 2, 1, 2, 1 };
+			static const uint8_t block_shift[5]   { 0, 2, 4, 5, 7 };
+			static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
+			static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+
+		if (quints)
+		{
+			static const uint8_t bits_to_read[3]  { 3, 2, 2 };
+			static const uint8_t block_shift[3]   { 0, 3, 5 };
+			static const uint8_t next_lcounter[3] { 1, 2, 0 };
+			static const uint8_t hcounter_incr[3] { 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+	}
+
+	// Unpack trit-blocks or quint-blocks as needed
+	if (trits)
+	{
+		unsigned int trit_blocks = (character_count + 4) / 5;
+		promise(trit_blocks > 0);
+		for (unsigned int i = 0; i < trit_blocks; i++)
+		{
+			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
+			results[5 * i    ] |= tritptr[0] << bits;
+			results[5 * i + 1] |= tritptr[1] << bits;
+			results[5 * i + 2] |= tritptr[2] << bits;
+			results[5 * i + 3] |= tritptr[3] << bits;
+			results[5 * i + 4] |= tritptr[4] << bits;
+		}
+	}
+
+	if (quints)
+	{
+		unsigned int quint_blocks = (character_count + 2) / 3;
+		promise(quint_blocks > 0);
+		for (unsigned int i = 0; i < quint_blocks; i++)
+		{
+			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
+			results[3 * i    ] |= quintptr[0] << bits;
+			results[3 * i + 1] |= quintptr[1] << bits;
+			results[3 * i + 2] |= quintptr[2] << bits;
+		}
+	}
+
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		output_data[i] = results[i];
+	}
+}

+ 2196 - 0
thirdparty/astcenc/astcenc_internal.h

@@ -0,0 +1,2196 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations.
+ */
+
+#ifndef ASTCENC_INTERNAL_INCLUDED
+#define ASTCENC_INTERNAL_INCLUDED
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#if defined(ASTCENC_DIAGNOSTICS)
+	#include <cstdio>
+#endif
+#include <cstdlib>
+
+#include "astcenc.h"
+#include "astcenc_mathlib.h"
+#include "astcenc_vecmathlib.h"
+
+/**
+ * @brief Make a promise to the compiler's optimizer.
+ *
+ * A promise is an expression that the optimizer can assume is true, to help it generate
+ * faster code. Common use cases for this are to promise that a for loop will iterate more than
+ * once, or that the loop iteration count is a multiple of a vector length, which avoids pre-loop
+ * checks and can avoid loop tails if loops are unrolled by the auto-vectorizer.
+ */
+#if defined(NDEBUG)
+	#if !defined(__clang__) && defined(_MSC_VER)
+		#define promise(cond) __assume(cond)
+	#elif defined(__clang__)
+		#if __has_builtin(__builtin_assume)
+			#define promise(cond) __builtin_assume(cond)
+		#elif __has_builtin(__builtin_unreachable)
+			#define promise(cond) if (!(cond)) { __builtin_unreachable(); }
+		#else
+			#define promise(cond)
+		#endif
+	#else // Assume GCC
+		#define promise(cond) if (!(cond)) { __builtin_unreachable(); }
+	#endif
+#else
+	#define promise(cond) assert(cond)
+#endif
+
+/* ============================================================================
+  Constants
+============================================================================ */
+#if !defined(ASTCENC_BLOCK_MAX_TEXELS)
+	#define ASTCENC_BLOCK_MAX_TEXELS 216 // A 3D 6x6x6 block
+#endif
+
+/** @brief The maximum number of texels a block can support (6x6x6 block). */
+static constexpr unsigned int BLOCK_MAX_TEXELS { ASTCENC_BLOCK_MAX_TEXELS };
+
+/** @brief The maximum number of components a block can support. */
+static constexpr unsigned int BLOCK_MAX_COMPONENTS { 4 };
+
+/** @brief The maximum number of partitions a block can support. */
+static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 };
+
+/** @brief The number of partitionings, per partition count, supported by the ASTC format. */
+static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 };
+
+/** @brief The maximum number of weights used during partition selection for texel clustering. */
+static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 };
+
+/** @brief The maximum number of weights a block can support. */
+static constexpr unsigned int BLOCK_MAX_WEIGHTS { 64 };
+
+/** @brief The maximum number of weights a block can support per plane in 2 plane mode. */
+static constexpr unsigned int BLOCK_MAX_WEIGHTS_2PLANE { BLOCK_MAX_WEIGHTS / 2 };
+
+/** @brief The minimum number of weight bits a candidate encoding must encode. */
+static constexpr unsigned int BLOCK_MIN_WEIGHT_BITS { 24 };
+
+/** @brief The maximum number of weight bits a candidate encoding can encode. */
+static constexpr unsigned int BLOCK_MAX_WEIGHT_BITS { 96 };
+
+/** @brief The index indicating a bad (unused) block mode in the remap array. */
+static constexpr uint16_t BLOCK_BAD_BLOCK_MODE { 0xFFFFu };
+
+/** @brief The index indicating a bad (unused) partitioning in the remap array. */
+static constexpr uint16_t BLOCK_BAD_PARTITIONING { 0xFFFFu };
+
+/** @brief The number of partition index bits supported by the ASTC format. */
+static constexpr unsigned int PARTITION_INDEX_BITS { 10 };
+
+/** @brief The offset of the plane 2 weights in shared weight arrays. */
+static constexpr unsigned int WEIGHTS_PLANE2_OFFSET { BLOCK_MAX_WEIGHTS_2PLANE };
+
+/** @brief The sum of quantized weights for one texel. */
+static constexpr float WEIGHTS_TEXEL_SUM { 16.0f };
+
+/** @brief The number of block modes supported by the ASTC format. */
+static constexpr unsigned int WEIGHTS_MAX_BLOCK_MODES { 2048 };
+
+/** @brief The number of weight grid decimation modes supported by the ASTC format. */
+static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 };
+
+/** @brief The high default error used to initialize error trackers. */
+static constexpr float ERROR_CALC_DEFAULT { 1e30f };
+
+/**
+ * @brief The minimum texel count for a block to use the one partition fast path.
+ *
+ * This setting skips 4x4 and 5x4 block sizes.
+ */
+static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
+
+/**
+ * @brief The maximum number of candidate encodings tested for each encoding mode.
+ *
+ * This can be dynamically reduced by the compression quality preset.
+ */
+static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
+
+/**
+ * @brief The maximum number of candidate partitionings tested for each encoding mode.
+ *
+ * This can be dynamically reduced by the compression quality preset.
+ */
+static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 32 };
+
+/**
+ * @brief The maximum quant level using full angular endpoint search method.
+ *
+ * The angular endpoint search is used to find the min/max weight that should
+ * be used for a given quantization level. It is effective but expensive, so
+ * we only use it where it has the most value - low quant levels with wide
+ * spacing. It is used below TUNE_MAX_ANGULAR_QUANT (inclusive). Above this we
+ * assume the min weight is 0.0f, and the max weight is 1.0f.
+ *
+ * Note the angular algorithm is vectorized, and using QUANT_12 exactly fills
+ * one 8-wide vector. Decreasing by one doesn't buy much performance, and
+ * increasing by one is disproportionately expensive.
+ */
+static constexpr unsigned int TUNE_MAX_ANGULAR_QUANT { 7 }; /* QUANT_12 */
+
+static_assert((BLOCK_MAX_TEXELS % ASTCENC_SIMD_WIDTH) == 0,
+              "BLOCK_MAX_TEXELS must be multiple of ASTCENC_SIMD_WIDTH");
+
+static_assert(BLOCK_MAX_TEXELS <= 216,
+              "BLOCK_MAX_TEXELS must not be greater than 216");
+
+static_assert((BLOCK_MAX_WEIGHTS % ASTCENC_SIMD_WIDTH) == 0,
+              "BLOCK_MAX_WEIGHTS must be multiple of ASTCENC_SIMD_WIDTH");
+
+static_assert((WEIGHTS_MAX_BLOCK_MODES % ASTCENC_SIMD_WIDTH) == 0,
+              "WEIGHTS_MAX_BLOCK_MODES must be multiple of ASTCENC_SIMD_WIDTH");
+
+
+/* ============================================================================
+  Commonly used data structures
+============================================================================ */
+
+/**
+ * @brief The ASTC endpoint formats.
+ *
+ * Note, the values here are used directly in the encoding in the format so do not rearrange.
+ */
+enum endpoint_formats
+{
+	FMT_LUMINANCE = 0,
+	FMT_LUMINANCE_DELTA = 1,
+	FMT_HDR_LUMINANCE_LARGE_RANGE = 2,
+	FMT_HDR_LUMINANCE_SMALL_RANGE = 3,
+	FMT_LUMINANCE_ALPHA = 4,
+	FMT_LUMINANCE_ALPHA_DELTA = 5,
+	FMT_RGB_SCALE = 6,
+	FMT_HDR_RGB_SCALE = 7,
+	FMT_RGB = 8,
+	FMT_RGB_DELTA = 9,
+	FMT_RGB_SCALE_ALPHA = 10,
+	FMT_HDR_RGB = 11,
+	FMT_RGBA = 12,
+	FMT_RGBA_DELTA = 13,
+	FMT_HDR_RGB_LDR_ALPHA = 14,
+	FMT_HDR_RGBA = 15
+};
+
+/**
+ * @brief The ASTC quantization methods.
+ *
+ * Note, the values here are used directly in the encoding in the format so do not rearrange.
+ */
+enum quant_method
+{
+	QUANT_2 = 0,
+	QUANT_3 = 1,
+	QUANT_4 = 2,
+	QUANT_5 = 3,
+	QUANT_6 = 4,
+	QUANT_8 = 5,
+	QUANT_10 = 6,
+	QUANT_12 = 7,
+	QUANT_16 = 8,
+	QUANT_20 = 9,
+	QUANT_24 = 10,
+	QUANT_32 = 11,
+	QUANT_40 = 12,
+	QUANT_48 = 13,
+	QUANT_64 = 14,
+	QUANT_80 = 15,
+	QUANT_96 = 16,
+	QUANT_128 = 17,
+	QUANT_160 = 18,
+	QUANT_192 = 19,
+	QUANT_256 = 20
+};
+
+/**
+ * @brief The number of levels use by an ASTC quantization method.
+ *
+ * @param method   The quantization method
+ *
+ * @return   The number of levels used by @c method.
+ */
+static inline unsigned int get_quant_level(quant_method method)
+{
+	switch (method)
+	{
+	case QUANT_2:   return   2;
+	case QUANT_3:   return   3;
+	case QUANT_4:   return   4;
+	case QUANT_5:   return   5;
+	case QUANT_6:   return   6;
+	case QUANT_8:   return   8;
+	case QUANT_10:  return  10;
+	case QUANT_12:  return  12;
+	case QUANT_16:  return  16;
+	case QUANT_20:  return  20;
+	case QUANT_24:  return  24;
+	case QUANT_32:  return  32;
+	case QUANT_40:  return  40;
+	case QUANT_48:  return  48;
+	case QUANT_64:  return  64;
+	case QUANT_80:  return  80;
+	case QUANT_96:  return  96;
+	case QUANT_128: return 128;
+	case QUANT_160: return 160;
+	case QUANT_192: return 192;
+	case QUANT_256: return 256;
+	}
+
+	// Unreachable - the enum is fully described
+	return 0;
+}
+
+/**
+ * @brief Computed metrics about a partition in a block.
+ */
+struct partition_metrics
+{
+	/** @brief The error-weighted average color in the partition. */
+	vfloat4 avg;
+
+	/** @brief The dominant error-weighted direction in the partition. */
+	vfloat4 dir;
+};
+
+/**
+ * @brief Computed lines for a three component analysis.
+ */
+struct partition_lines3
+{
+	/** @brief Line for uncorrelated chroma. */
+	line3 uncor_line;
+
+	/** @brief Line for correlated chroma, passing through the origin. */
+	line3 samec_line;
+
+	/** @brief Post-processed line for uncorrelated chroma. */
+	processed_line3 uncor_pline;
+
+	/** @brief Post-processed line for correlated chroma, passing through the origin. */
+	processed_line3 samec_pline;
+
+	/** @brief The length of the line for uncorrelated chroma. */
+	float uncor_line_len;
+
+	/** @brief The length of the line for correlated chroma. */
+	float samec_line_len;
+};
+
+/**
+ * @brief The partition information for a single partition.
+ *
+ * ASTC has a total of 1024 candidate partitions for each of 2/3/4 partition counts, although this
+ * 1024 includes seeds that generate duplicates of other seeds and seeds that generate completely
+ * empty partitions. These are both valid encodings, but astcenc will skip both during compression
+ * as they are not useful.
+ */
+struct partition_info
+{
+	/** @brief The number of partitions in this partitioning. */
+	uint16_t partition_count;
+
+	/** @brief The index (seed) of this partitioning. */
+	uint16_t partition_index;
+
+	/**
+	 * @brief The number of texels in each partition.
+	 *
+	 * Note that seeds which result in zero texels assigned to a partition are valid, but are skipped
+	 * by this compressor as there is no point spending bits encoding an unused color endpoint.
+	 */
+	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The partition of each texel in the block. */
+	uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
+
+	/** @brief The list of texels in each partition. */
+	uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
+};
+
+/**
+ * @brief The weight grid information for a single decimation pattern.
+ *
+ * ASTC can store one weight per texel, but is also capable of storing lower resolution weight grids
+ * that are interpolated during decompression to assign a weight to a texel. Storing fewer weights
+ * can free up a substantial amount of bits that we can then spend on more useful things, such as
+ * more accurate endpoints and weights, or additional partitions.
+ *
+ * This data structure is used to store information about a single weight grid decimation pattern,
+ * for a single block size.
+ */
+struct decimation_info
+{
+	/** @brief The total number of texels in the block. */
+	uint8_t texel_count;
+
+	/** @brief The maximum number of stored weights that contribute to each texel, between 1 and 4. */
+	uint8_t max_texel_weight_count;
+
+	/** @brief The total number of weights stored. */
+	uint8_t weight_count;
+
+	/** @brief The number of stored weights in the X dimension. */
+	uint8_t weight_x;
+
+	/** @brief The number of stored weights in the Y dimension. */
+	uint8_t weight_y;
+
+	/** @brief The number of stored weights in the Z dimension. */
+	uint8_t weight_z;
+
+	/**
+	 * @brief The number of weights that contribute to each texel.
+	 * Value is between 1 and 4.
+	 */
+	uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
+
+	/**
+	 * @brief The weight index of the N weights that are interpolated for each texel.
+	 * Stored transposed to improve vectorization.
+	 */
+	uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
+
+	/**
+	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
+	 * Value is between 0 and 16, stored transposed to improve vectorization.
+	 */
+	uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
+
+	/**
+	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
+	 * Value is between 0 and 1, stored transposed to improve vectorization.
+	 */
+	alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
+
+	/** @brief The number of texels that each stored weight contributes to. */
+	uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief The list of texels that use a specific weight index.
+	 * Stored transposed to improve vectorization.
+	 */
+	uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief The bilinear contribution to the N texels that use each weight.
+	 * Value is between 0 and 1, stored transposed to improve vectorization.
+	 */
+	alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief The bilinear contribution to the Nth texel that uses each weight.
+	 * Value is between 0 and 1, stored transposed to improve vectorization.
+	 */
+	float texel_contrib_for_weight[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+};
+
+/**
+ * @brief Metadata for single block mode for a specific block size.
+ */
+struct block_mode
+{
+	/** @brief The block mode index in the ASTC encoded form. */
+	uint16_t mode_index;
+
+	/** @brief The decimation mode index in the compressor reindexed list. */
+	uint8_t decimation_mode;
+
+	/** @brief The weight quantization used by this block mode. */
+	uint8_t quant_mode;
+
+	/** @brief The weight quantization used by this block mode. */
+	uint8_t weight_bits;
+
+	/** @brief Is a dual weight plane used by this block mode? */
+	uint8_t is_dual_plane : 1;
+
+	/**
+	 * @brief Get the weight quantization used by this block mode.
+	 *
+	 * @return The quantization level.
+	 */
+	inline quant_method get_weight_quant_mode() const
+	{
+		return static_cast<quant_method>(this->quant_mode);
+	}
+};
+
+/**
+ * @brief Metadata for single decimation mode for a specific block size.
+ */
+struct decimation_mode
+{
+	/** @brief The max weight precision for 1 plane, or -1 if not supported. */
+	int8_t maxprec_1plane;
+
+	/** @brief The max weight precision for 2 planes, or -1 if not supported. */
+	int8_t maxprec_2planes;
+
+	/**
+	 * @brief Bitvector indicating weight quant modes used by active 1 plane block modes.
+	 *
+	 * Bit 0 = QUANT_2, Bit 1 = QUANT_3, etc.
+	 */
+	uint16_t refprec_1_plane;
+
+	/**
+	 * @brief Bitvector indicating weight quant methods used by active 2 plane block modes.
+	 *
+	 * Bit 0 = QUANT_2, Bit 1 = QUANT_3, etc.
+	 */
+	uint16_t refprec_2_planes;
+
+	/**
+	 * @brief Set a 1 plane weight quant as active.
+	 *
+	 * @param weight_quant   The quant method to set.
+	 */
+	void set_ref_1_plane(quant_method weight_quant)
+	{
+		refprec_1_plane |= (1 << weight_quant);
+	}
+
+	/**
+	 * @brief Test if this mode is active below a given 1 plane weight quant (inclusive).
+	 *
+	 * @param max_weight_quant   The max quant method to test.
+	 */
+	bool is_ref_1_plane(quant_method max_weight_quant) const
+	{
+		uint16_t mask = static_cast<uint16_t>((1 << (max_weight_quant + 1)) - 1);
+		return (refprec_1_plane & mask) != 0;
+	}
+
+	/**
+	 * @brief Set a 2 plane weight quant as active.
+	 *
+	 * @param weight_quant   The quant method to set.
+	 */
+	void set_ref_2_plane(quant_method weight_quant)
+	{
+		refprec_2_planes |= static_cast<uint16_t>(1 << weight_quant);
+	}
+
+	/**
+	 * @brief Test if this mode is active below a given 2 plane weight quant (inclusive).
+	 *
+	 * @param max_weight_quant   The max quant method to test.
+	 */
+	bool is_ref_2_plane(quant_method max_weight_quant) const
+	{
+		uint16_t mask = static_cast<uint16_t>((1 << (max_weight_quant + 1)) - 1);
+		return (refprec_2_planes & mask) != 0;
+	}
+};
+
+/**
+ * @brief Data tables for a single block size.
+ *
+ * The decimation tables store the information to apply weight grid dimension reductions. We only
+ * store the decimation modes that are actually needed by the current context; many of the possible
+ * modes will be unused (too many weights for the current block size or disabled by heuristics). The
+ * actual number of weights stored is @c decimation_mode_count, and the @c decimation_modes and
+ * @c decimation_tables arrays store the active modes contiguously at the start of the array. These
+ * entries are not stored in any particular order.
+ *
+ * The block mode tables store the unpacked block mode settings. Block modes are stored in the
+ * compressed block as an 11 bit field, but for any given block size and set of compressor
+ * heuristics, only a subset of the block modes will be used. The actual number of block modes
+ * stored is indicated in @c block_mode_count, and the @c block_modes array store the active modes
+ * contiguously at the start of the array. These entries are stored in incrementing "packed" value
+ * order, which doesn't mean much once unpacked. To allow decompressors to reference the packed data
+ * efficiently the @c block_mode_packed_index array stores the mapping between physical ID and the
+ * actual remapped array index.
+ */
+struct block_size_descriptor
+{
+	/** @brief The block X dimension, in texels. */
+	uint8_t xdim;
+
+	/** @brief The block Y dimension, in texels. */
+	uint8_t ydim;
+
+	/** @brief The block Z dimension, in texels. */
+	uint8_t zdim;
+
+	/** @brief The block total texel count. */
+	uint8_t texel_count;
+
+	/**
+	 * @brief The number of stored decimation modes which are "always" modes.
+	 *
+	 * Always modes are stored at the start of the decimation_modes list.
+	 */
+	unsigned int decimation_mode_count_always;
+
+	/** @brief The number of stored decimation modes for selected encodings. */
+	unsigned int decimation_mode_count_selected;
+
+	/** @brief The number of stored decimation modes for any encoding. */
+	unsigned int decimation_mode_count_all;
+
+	/**
+	 * @brief The number of stored block modes which are "always" modes.
+	 *
+	 * Always modes are stored at the start of the block_modes list.
+	 */
+	unsigned int block_mode_count_1plane_always;
+
+	/** @brief The number of stored block modes for active 1 plane encodings. */
+	unsigned int block_mode_count_1plane_selected;
+
+	/** @brief The number of stored block modes for active 1 and 2 plane encodings. */
+	unsigned int block_mode_count_1plane_2plane_selected;
+
+	/** @brief The number of stored block modes for any encoding. */
+	unsigned int block_mode_count_all;
+
+	/** @brief The number of selected partitionings for 1/2/3/4 partitionings. */
+	unsigned int partitioning_count_selected[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The number of partitionings for 1/2/3/4 partitionings. */
+	unsigned int partitioning_count_all[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The active decimation modes, stored in low indices. */
+	decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
+
+	/** @brief The active decimation tables, stored in low indices. */
+	alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
+
+	/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
+	uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The active block modes, stored in low indices. */
+	block_mode block_modes[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The active partition tables, stored in low indices per-count. */
+	partition_info partitionings[(3 * BLOCK_MAX_PARTITIONINGS) + 1];
+
+	/**
+	 * @brief The packed partition table array index, or @c BLOCK_BAD_PARTITIONING if not active.
+	 *
+	 * Indexed by partition_count - 2, containing 2, 3 and 4 partitions.
+	 */
+	uint16_t partitioning_packed_index[3][BLOCK_MAX_PARTITIONINGS];
+
+	/** @brief The active texels for k-means partition selection. */
+	uint8_t kmeans_texels[BLOCK_MAX_KMEANS_TEXELS];
+
+	/**
+	 * @brief The canonical 2-partition coverage pattern used during block partition search.
+	 *
+	 * Indexed by remapped index, not physical index.
+	 */
+	uint64_t coverage_bitmaps_2[BLOCK_MAX_PARTITIONINGS][2];
+
+	/**
+	 * @brief The canonical 3-partition coverage pattern used during block partition search.
+	 *
+	 * Indexed by remapped index, not physical index.
+	 */
+	uint64_t coverage_bitmaps_3[BLOCK_MAX_PARTITIONINGS][3];
+
+	/**
+	 * @brief The canonical 4-partition coverage pattern used during block partition search.
+	 *
+	 * Indexed by remapped index, not physical index.
+	 */
+	uint64_t coverage_bitmaps_4[BLOCK_MAX_PARTITIONINGS][4];
+
+	/**
+	 * @brief Get the block mode structure for index @c block_mode.
+	 *
+	 * This function can only return block modes that are enabled by the current compressor config.
+	 * Decompression from an arbitrary source should not use this without first checking that the
+	 * packed block mode index is not @c BLOCK_BAD_BLOCK_MODE.
+	 *
+	 * @param block_mode   The packed block mode index.
+	 *
+	 * @return The block mode structure.
+	 */
+	const block_mode& get_block_mode(unsigned int block_mode) const
+	{
+		unsigned int packed_index = this->block_mode_packed_index[block_mode];
+		assert(packed_index != BLOCK_BAD_BLOCK_MODE && packed_index < this->block_mode_count_all);
+		return this->block_modes[packed_index];
+	}
+
+	/**
+	 * @brief Get the decimation mode structure for index @c decimation_mode.
+	 *
+	 * This function can only return decimation modes that are enabled by the current compressor
+	 * config. The mode array is stored packed, but this is only ever indexed by the packed index
+	 * stored in the @c block_mode and never exists in an unpacked form.
+	 *
+	 * @param decimation_mode   The packed decimation mode index.
+	 *
+	 * @return The decimation mode structure.
+	 */
+	const decimation_mode& get_decimation_mode(unsigned int decimation_mode) const
+	{
+		return this->decimation_modes[decimation_mode];
+	}
+
+	/**
+	 * @brief Get the decimation info structure for index @c decimation_mode.
+	 *
+	 * This function can only return decimation modes that are enabled by the current compressor
+	 * config. The mode array is stored packed, but this is only ever indexed by the packed index
+	 * stored in the @c block_mode and never exists in an unpacked form.
+	 *
+	 * @param decimation_mode   The packed decimation mode index.
+	 *
+	 * @return The decimation info structure.
+	 */
+	const decimation_info& get_decimation_info(unsigned int decimation_mode) const
+	{
+		return this->decimation_tables[decimation_mode];
+	}
+
+	/**
+	 * @brief Get the partition info table for a given partition count.
+	 *
+	 * @param partition_count   The number of partitions we want the table for.
+	 *
+	 * @return The pointer to the table of 1024 entries (for 2/3/4 parts) or 1 entry (for 1 part).
+	 */
+	const partition_info* get_partition_table(unsigned int partition_count) const
+	{
+		if (partition_count == 1)
+		{
+			partition_count = 5;
+		}
+		unsigned int index = (partition_count - 2) * BLOCK_MAX_PARTITIONINGS;
+		return this->partitionings + index;
+	}
+
+	/**
+	 * @brief Get the partition info structure for a given partition count and seed.
+	 *
+	 * @param partition_count   The number of partitions we want the info for.
+	 * @param index             The partition seed (between 0 and 1023).
+	 *
+	 * @return The partition info structure.
+	 */
+	const partition_info& get_partition_info(unsigned int partition_count, unsigned int index) const
+	{
+		unsigned int packed_index = 0;
+		if (partition_count >= 2)
+		{
+			packed_index = this->partitioning_packed_index[partition_count - 2][index];
+		}
+
+		assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]);
+		auto& result = get_partition_table(partition_count)[packed_index];
+		assert(index == result.partition_index);
+		return result;
+	}
+
+	/**
+	 * @brief Get the partition info structure for a given partition count and seed.
+	 *
+	 * @param partition_count   The number of partitions we want the info for.
+	 * @param packed_index      The raw array offset.
+	 *
+	 * @return The partition info structure.
+	 */
+	const partition_info& get_raw_partition_info(unsigned int partition_count, unsigned int packed_index) const
+	{
+		assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]);
+		auto& result = get_partition_table(partition_count)[packed_index];
+		return result;
+	}
+};
+
+/**
+ * @brief The image data for a single block.
+ *
+ * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
+ * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
+ * data is stored as direct UNORM data, HDR data is stored as LNS data.
+ *
+ * The @c rgb_lns and @c alpha_lns fields, which assign a per-texel use of HDR, are only used during
+ * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
+ */
+struct image_block
+{
+	/** @brief The input (compress) or output (decompress) data for the red color component. */
+	alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
+
+	/** @brief The input (compress) or output (decompress) data for the green color component. */
+	alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
+
+	/** @brief The input (compress) or output (decompress) data for the blue color component. */
+	alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
+
+	/** @brief The input (compress) or output (decompress) data for the alpha color component. */
+	alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
+
+	/** @brief The number of texels in the block. */
+	uint8_t texel_count;
+
+	/** @brief The original data for texel 0 for constant color block encoding. */
+	vfloat4 origin_texel;
+
+	/** @brief The min component value of all texels in the block. */
+	vfloat4 data_min;
+
+	/** @brief The mean component value of all texels in the block. */
+	vfloat4 data_mean;
+
+	/** @brief The max component value of all texels in the block. */
+	vfloat4 data_max;
+
+	/** @brief The relative error significance of the color channels. */
+	vfloat4 channel_weight;
+
+	/** @brief Is this a grayscale block where R == G == B for all texels? */
+	bool grayscale;
+
+	/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
+	uint8_t rgb_lns[BLOCK_MAX_TEXELS];
+
+	/** @brief Set to 1 if a texel is using HDR alpha endpoints (decompression only). */
+	uint8_t alpha_lns[BLOCK_MAX_TEXELS];
+
+	/** @brief The X position of this block in the input or output image. */
+	unsigned int xpos;
+
+	/** @brief The Y position of this block in the input or output image. */
+	unsigned int ypos;
+
+	/** @brief The Z position of this block in the input or output image. */
+	unsigned int zpos;
+
+	/**
+	 * @brief Get an RGBA texel value from the data.
+	 *
+	 * @param index   The texel index.
+	 *
+	 * @return The texel in RGBA component ordering.
+	 */
+	inline vfloat4 texel(unsigned int index) const
+	{
+		return vfloat4(data_r[index],
+		               data_g[index],
+		               data_b[index],
+		               data_a[index]);
+	}
+
+	/**
+	 * @brief Get an RGB texel value from the data.
+	 *
+	 * @param index   The texel index.
+	 *
+	 * @return The texel in RGB0 component ordering.
+	 */
+	inline vfloat4 texel3(unsigned int index) const
+	{
+		return vfloat3(data_r[index],
+		               data_g[index],
+		               data_b[index]);
+	}
+
+	/**
+	 * @brief Get the default alpha value for endpoints that don't store it.
+	 *
+	 * The default depends on whether the alpha endpoint is LDR or HDR.
+	 *
+	 * @return The alpha value in the scaled range used by the compressor.
+	 */
+	inline float get_default_alpha() const
+	{
+		return this->alpha_lns[0] ? static_cast<float>(0x7800) : static_cast<float>(0xFFFF);
+	}
+
+	/**
+	 * @brief Test if a single color channel is constant across the block.
+	 *
+	 * Constant color channels are easier to compress as interpolating between two identical colors
+	 * always returns the same value, irrespective of the weight used. They therefore can be ignored
+	 * for the purposes of weight selection and use of a second weight plane.
+	 *
+	 * @return @c true if the channel is constant across the block, @c false otherwise.
+	 */
+	inline bool is_constant_channel(int channel) const
+	{
+		vmask4 lane_mask = vint4::lane_id() == vint4(channel);
+		vmask4 color_mask = this->data_min == this->data_max;
+		return any(lane_mask & color_mask);
+	}
+
+	/**
+	 * @brief Test if this block is a luminance block with constant 1.0 alpha.
+	 *
+	 * @return @c true if the block is a luminance block, @c false otherwise.
+	 */
+	inline bool is_luminance() const
+	{
+		float default_alpha = this->get_default_alpha();
+		bool alpha1 = (this->data_min.lane<3>() == default_alpha) &&
+		              (this->data_max.lane<3>() == default_alpha);
+		return this->grayscale && alpha1;
+	}
+
+	/**
+	 * @brief Test if this block is a luminance block with variable alpha.
+	 *
+	 * @return @c true if the block is a luminance + alpha block, @c false otherwise.
+	 */
+	inline bool is_luminancealpha() const
+	{
+		float default_alpha = this->get_default_alpha();
+		bool alpha1 = (this->data_min.lane<3>() == default_alpha) &&
+		              (this->data_max.lane<3>() == default_alpha);
+		return this->grayscale && !alpha1;
+	}
+};
+
+/**
+ * @brief Data structure storing the color endpoints for a block.
+ */
+struct endpoints
+{
+	/** @brief The number of partition endpoints stored. */
+	unsigned int partition_count;
+
+	/** @brief The colors for endpoint 0. */
+	vfloat4 endpt0[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The colors for endpoint 1. */
+	vfloat4 endpt1[BLOCK_MAX_PARTITIONS];
+};
+
+/**
+ * @brief Data structure storing the color endpoints and weights.
+ */
+struct endpoints_and_weights
+{
+	/** @brief True if all active values in weight_error_scale are the same. */
+	bool is_constant_weight_error_scale;
+
+	/** @brief The color endpoints. */
+	endpoints ep;
+
+	/** @brief The ideal weight for each texel; may be undecimated or decimated. */
+	alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
+
+	/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
+	alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
+};
+
+/**
+ * @brief Utility storing estimated errors from choosing particular endpoint encodings.
+ */
+struct encoding_choice_errors
+{
+	/** @brief Error of using LDR RGB-scale instead of complete endpoints. */
+	float rgb_scale_error;
+
+	/** @brief Error of using HDR RGB-scale-with-luminance instead of complete endpoints. */
+	float rgb_luma_error;
+
+	/** @brief Error of using luminance instead of RGB. */
+	float luminance_error;
+
+	/** @brief Error of discarding alpha and using a constant 1.0 alpha. */
+	float alpha_drop_error;
+
+	/** @brief Can we use delta offset encoding? */
+	bool can_offset_encode;
+
+	/** @brief Can we use blue contraction encoding? */
+	bool can_blue_contract;
+};
+
+/**
+ * @brief Preallocated working buffers, allocated per thread during context creation.
+ */
+struct alignas(ASTCENC_VECALIGN) compression_working_buffers
+{
+	/** @brief Ideal endpoints and weights for plane 1. */
+	endpoints_and_weights ei1;
+
+	/** @brief Ideal endpoints and weights for plane 2. */
+	endpoints_and_weights ei2;
+
+	/**
+	 * @brief Decimated ideal weight values in the ~0-1 range.
+	 *
+	 * Note that values can be slightly below zero or higher than one due to
+	 * endpoint extents being inside the ideal color representation.
+	 *
+	 * For two planes, the second plane starts at offset @c WEIGHTS_PLANE2_OFFSET.
+	 */
+	alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief Decimated quantized weight values in the unquantized 0-64 range.
+	 *
+	 * For two planes, the second plane starts at offset @c WEIGHTS_PLANE2_OFFSET.
+	 */
+	uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
+
+	/** @brief Error of the best encoding combination for each block mode. */
+	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode. */
+	uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode if modes are the same and we have spare bits. */
+	uint8_t best_quant_levels_mod[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best endpoint format for each partition. */
+	uint8_t best_ep_formats[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS];
+
+	/** @brief The total bit storage needed for quantized weights for each block mode. */
+	int8_t qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The cumulative error for quantized weights for each block mode. */
+	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each block mode. */
+	float weight_low_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 1 for each block mode. */
+	float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each quant level and decimation mode. */
+	float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The high weight value in plane 1 for each quant level and decimation mode. */
+	float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The low weight value in plane 2 for each block mode. */
+	float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 2 for each block mode. */
+	float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 2 for each quant level and decimation mode. */
+	float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The high weight value in plane 2 for each quant level and decimation mode. */
+	float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+};
+
+/**
+ * @brief Scratch working buffers for building weight decimation tables.
+ *
+ * NOTE(review): the field names suggest these store the forward and reverse
+ * mappings between texels and the grid weights that influence them — confirm
+ * against the decimation table initialization code before relying on this.
+ */
+struct dt_init_working_buffers
+{
+	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
+	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
+	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
+
+	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
+	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
+	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
+};
+
+/**
+ * @brief Weight quantization transfer table.
+ *
+ * ASTC can store texel weights at many quantization levels, so for performance we store essential
+ * information about each level as a precomputed data structure. Unquantized weights are integers
+ * or floats in the range [0, 64].
+ *
+ * This structure provides a table used to estimate the closest quantized weight for a given
+ * floating-point weight. It also stores, for each quantized weight, the corresponding unquantized
+ * value, and the previous and next unquantized values in the quantization lattice.
+ */
+struct quant_and_transfer_table
+{
+	/** @brief The unscrambled unquantized value. */
+	int8_t quant_to_unquant[32];
+
+	/** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */
+	int8_t scramble_map[32];
+
+	/** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */
+	int8_t unscramble_and_unquant_map[32];
+
+	/**
+	 * @brief A table of previous-and-next weights, indexed by the current unquantized value.
+	 *  * bits 7:0 = previous-index, unquantized
+	 *  * bits 15:8 = next-index, unquantized
+	 */
+	uint16_t prev_next_values[65];
+};
+
+/** @brief The precomputed quant and transfer table. */
+extern const quant_and_transfer_table quant_and_xfer_tables[12];
+
+/** @brief The block is an error block, and will return error color or NaN. */
+static constexpr uint8_t SYM_BTYPE_ERROR { 0 };
+
+/** @brief The block is a constant color block using FP16 colors. */
+static constexpr uint8_t SYM_BTYPE_CONST_F16 { 1 };
+
+/** @brief The block is a constant color block using UNORM16 colors. */
+static constexpr uint8_t SYM_BTYPE_CONST_U16 { 2 };
+
+/** @brief The block is a normal non-constant color block. */
+static constexpr uint8_t SYM_BTYPE_NONCONST { 3 };
+
+/**
+ * @brief A symbolic representation of a compressed block.
+ *
+ * The symbolic representation stores the unpacked content of a single
+ * @c physical_compressed_block, in a form which is much easier to access for
+ * the rest of the compressor code.
+ */
+struct symbolic_compressed_block
+{
+	/** @brief The block type, one of the @c SYM_BTYPE_* constants. */
+	uint8_t block_type;
+
+	/** @brief The number of partitions; valid for @c NONCONST blocks. */
+	uint8_t partition_count;
+
+	/** @brief Non-zero if the color formats matched; valid for @c NONCONST blocks. */
+	uint8_t color_formats_matched;
+
+	/** @brief The plane 2 color component, or -1 if single plane; valid for @c NONCONST blocks. */
+	int8_t plane2_component;
+
+	/** @brief The block mode; valid for @c NONCONST blocks. */
+	uint16_t block_mode;
+
+	/** @brief The partition index; valid for @c NONCONST blocks if 2 or more partitions. */
+	uint16_t partition_index;
+
+	/** @brief The endpoint color formats for each partition; valid for @c NONCONST blocks. */
+	uint8_t color_formats[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The endpoint color quant mode; valid for @c NONCONST blocks. */
+	quant_method quant_mode;
+
+	/** @brief The error of the current encoding; valid for @c NONCONST blocks. */
+	float errorval;
+
+	// We can't have both of these at the same time
+	union {
+		/** @brief The constant color; valid for @c CONST blocks. */
+		int constant_color[BLOCK_MAX_COMPONENTS];
+
+		/** @brief The quantized endpoint color pairs; valid for @c NONCONST blocks. */
+		uint8_t color_values[BLOCK_MAX_PARTITIONS][8];
+	};
+
+	/** @brief The quantized and decimated weights.
+	 *
+	 * Weights are stored in the 0-64 unpacked range allowing them to be used
+	 * directly in encoding passes without per-use unpacking. Packing happens
+	 * when converting to/from the physical bitstream encoding.
+	 *
+	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
+	 */
+	uint8_t weights[BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief Get the endpoint color quantization used by this block.
+	 *
+	 * @return The color quantization level (@c quant_mode).
+	 */
+	inline quant_method get_color_quant_mode() const
+	{
+		return this->quant_mode;
+	}
+};
+
+/**
+ * @brief A physical representation of a compressed block.
+ *
+ * The physical representation stores the raw bytes of the format in memory.
+ */
+struct physical_compressed_block
+{
+	/** @brief The ASTC encoded data for a single block; always 16 bytes (128 bits). */
+	uint8_t data[16];
+};
+
+
+/**
+ * @brief Parameter structure for @c compute_pixel_region_variance().
+ *
+ * This function takes a structure to avoid spilling arguments to the stack on every function
+ * invocation, as there are a lot of parameters.
+ */
+struct pixel_region_args
+{
+	/** @brief The image to analyze. */
+	const astcenc_image* img;
+
+	/** @brief The component swizzle pattern. */
+	astcenc_swizzle swz;
+
+	/** @brief Should the algorithm process the Z axis? */
+	bool have_z;
+
+	/** @brief The kernel radius for alpha processing. */
+	unsigned int alpha_kernel_radius;
+
+	/** @brief The X dimension of the working data to process. */
+	unsigned int size_x;
+
+	/** @brief The Y dimension of the working data to process. */
+	unsigned int size_y;
+
+	/** @brief The Z dimension of the working data to process. */
+	unsigned int size_z;
+
+	/** @brief The X position of first src and dst data in the data set. */
+	unsigned int offset_x;
+
+	/** @brief The Y position of first src and dst data in the data set. */
+	unsigned int offset_y;
+
+	/** @brief The Z position of first src and dst data in the data set. */
+	unsigned int offset_z;
+
+	/** @brief The working memory buffer. */
+	vfloat4 *work_memory;
+};
+
+/**
+ * @brief Parameter structure for @c compute_averages_proc().
+ */
+struct avg_args
+{
+	/** @brief The arguments for the nested variance computation. */
+	pixel_region_args arg;
+
+	/** @brief The image X dimension. */
+	unsigned int img_size_x;
+
+	/** @brief The image Y dimension. */
+	unsigned int img_size_y;
+
+	/** @brief The image Z dimension. */
+	unsigned int img_size_z;
+
+	/** @brief The maximum working block dimension in the X and Y dimensions. */
+	unsigned int blk_size_xy;
+
+	/** @brief The maximum working block dimension in the Z dimension. */
+	unsigned int blk_size_z;
+
+	/** @brief The working block memory size. */
+	unsigned int work_memory_size;
+};
+
+#if defined(ASTCENC_DIAGNOSTICS)
+/* See astcenc_diagnostic_trace header for details. */
+class TraceLog;
+#endif
+
+/**
+ * @brief The astcenc compression context.
+ *
+ * Stores the creation-time configuration, per-thread scratch buffers, and
+ * the preprocessing state used across compression passes.
+ */
+struct astcenc_contexti
+{
+	/** @brief The configuration this context was created with. */
+	astcenc_config config;
+
+	/** @brief The thread count supported by this context. */
+	unsigned int thread_count;
+
+	/** @brief The block size descriptor this context was created with. */
+	block_size_descriptor* bsd;
+
+	/*
+	 * Fields below here are not needed in a decompress-only build, but some remain as they are
+	 * small and it avoids littering the code with #ifdefs. The most significant contributors to
+	 * large structure size are omitted.
+	 */
+
+	/** @brief The input image alpha channel averages table, may be @c nullptr if not needed. */
+	float* input_alpha_averages;
+
+	/** @brief The scratch working buffers, one per thread (see @c thread_count). */
+	compression_working_buffers* working_buffers;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	/** @brief The pixel region and variance worker arguments. */
+	avg_args avg_preprocess_args;
+#endif
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	/**
+	 * @brief The diagnostic trace logger.
+	 *
+	 * Note that this is a singleton, so can only be used in single threaded mode. It only exists
+	 * here so we have a reference to close the file at the end of the capture.
+	 */
+	TraceLog* trace_log;
+#endif
+};
+
+/* ============================================================================
+  Functionality for managing block sizes and partition tables.
+============================================================================ */
+
+/**
+ * @brief Populate the block size descriptor for the target block size.
+ *
+ * This will also initialize the partition table metadata, which is stored as part of the BSD
+ * structure.
+ *
+ * @param      x_texels                 The number of texels in the block X dimension.
+ * @param      y_texels                 The number of texels in the block Y dimension.
+ * @param      z_texels                 The number of texels in the block Z dimension.
+ * @param      can_omit_modes           Can we discard modes and partitionings that astcenc won't use?
+ * @param      partition_count_cutoff   The partition count cutoff to use, if we can omit partitionings.
+ * @param      mode_cutoff              The block mode percentile cutoff [0-1].
+ * @param[out] bsd                      The descriptor to initialize.
+ */
+void init_block_size_descriptor(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int z_texels,
+	bool can_omit_modes,
+	unsigned int partition_count_cutoff,
+	float mode_cutoff,
+	block_size_descriptor& bsd);
+
+/**
+ * @brief Populate the partition tables for the target block size.
+ *
+ * Note the @c bsd descriptor must be initialized by calling @c init_block_size_descriptor() before
+ * calling this function.
+ *
+ * @param[out] bsd                      The block size information structure to populate.
+ * @param      can_omit_partitionings   True if we can we drop partitionings that astcenc won't use.
+ * @param      partition_count_cutoff   The partition count cutoff to use, if we can omit partitionings.
+ */
+void init_partition_tables(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff);
+
+/**
+ * @brief Get the percentile table for 2D block modes.
+ *
+ * This is an empirically determined prioritization of which block modes to use in the search in
+ * terms of their centile (lower centiles = more useful).
+ *
+ * Returns a dynamically allocated array; caller must free with delete[].
+ *
+ * @param xdim The block x size.
+ * @param ydim The block y size.
+ *
+ * @return The unpacked table.
+ */
+const float* get_2d_percentile_table(
+	unsigned int xdim,
+	unsigned int ydim);
+
+/**
+ * @brief Query if a 2D block size is legal.
+ *
+ * @return True if legal, false otherwise.
+ */
+bool is_legal_2d_block_size(
+	unsigned int xdim,
+	unsigned int ydim);
+
+/**
+ * @brief Query if a 3D block size is legal.
+ *
+ * @return True if legal, false otherwise.
+ */
+bool is_legal_3d_block_size(
+	unsigned int xdim,
+	unsigned int ydim,
+	unsigned int zdim);
+
+/* ============================================================================
+  Functionality for managing BISE quantization and unquantization.
+============================================================================ */
+
+/**
+ * @brief The precomputed table for quantizing color values.
+ *
+ * Converts unquant value in 0-255 range into quant value in 0-255 range.
+ * No BISE scrambling is applied at this stage.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t color_unquant_to_uquant_tables[17][256];
+
+/**
+ * @brief The precomputed table for packing quantized color values.
+ *
+ * Converts quant value in 0-255 range into packed quant value in 0-N range,
+ * with BISE scrambling applied.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t color_uquant_to_scrambled_pquant_tables[17][256];
+
+/**
+ * @brief The precomputed table for unpacking color values.
+ *
+ * Converts quant value in 0-N range into unpacked value in 0-255 range,
+ * with BISE unscrambling applied.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t* color_scrambled_pquant_to_uquant_tables[17];
+
+/**
+ * @brief The precomputed quant mode storage table.
+ *
+ * Indexing by [integer_count/2][bits] gives us the quantization level for a given integer count and
+ * number of compressed storage bits. Returns -1 for cases where the requested integer count cannot
+ * ever fit in the supplied storage size.
+ */
+extern const int8_t quant_mode_table[10][128];
+
+/**
+ * @brief Encode a packed string using BISE.
+ *
+ * Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
+ * start storing strings in a block at arbitrary bit offsets in the encoded data.
+ *
+ * @param         quant_level       The BISE alphabet size.
+ * @param         character_count   The number of characters in the string.
+ * @param         input_data        The unpacked string, one byte per character.
+ * @param[in,out] output_data       The output packed string.
+ * @param         bit_offset        The starting offset in the output storage.
+ */
+void encode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset);
+
+/**
+ * @brief Decode a packed string using BISE.
+ *
+ * Note that BISE input strings are not a whole number of bytes in length, and ASTC can start
+ * strings at arbitrary bit offsets in the encoded data.
+ *
+ * @param         quant_level       The BISE alphabet size.
+ * @param         character_count   The number of characters in the string.
+ * @param         input_data        The packed string.
+ * @param[in,out] output_data       The output storage, one byte per character.
+ * @param         bit_offset        The starting offset in the output storage.
+ */
+void decode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset);
+
+/**
+ * @brief Return the number of bits needed to encode an ISE sequence.
+ *
+ * This implementation assumes that the @c quant level is untrusted, given it may come from random
+ * data being decompressed, so we return an arbitrary unencodable size if that is the case.
+ *
+ * @param character_count   The number of items in the sequence.
+ * @param quant_level       The desired quantization level.
+ *
+ * @return The number of bits needed to encode the BISE string.
+ */
+unsigned int get_ise_sequence_bitcount(
+	unsigned int character_count,
+	quant_method quant_level);
+
+/* ============================================================================
+  Functionality for managing color partitioning.
+============================================================================ */
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 2 component texture.
+ *
+ * @param      pi           The partition info for the current trial.
+ * @param      blk          The image block color data to be compressed.
+ * @param      component1   The first component included in the analysis.
+ * @param      component2   The second component included in the analysis.
+ * @param[out] pm           The output partition metrics.
+ *                          - Only pi.partition_count array entries actually get initialized.
+ *                          - Direction vectors @c pm.dir are not normalized.
+ */
+void compute_avgs_and_dirs_2_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int component1,
+	unsigned int component2,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 3 component texture.
+ *
+ * @param      pi                  The partition info for the current trial.
+ * @param      blk                 The image block color data to be compressed.
+ * @param      omitted_component   The component excluded from the analysis.
+ * @param[out] pm                  The output partition metrics.
+ *                                 - Only pi.partition_count array entries actually get initialized.
+ *                                 - Direction vectors @c pm.dir are not normalized.
+ */
+void compute_avgs_and_dirs_3_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int omitted_component,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 3 component texture.
+ *
+ * This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is
+ * always alpha, a common case during partition search.
+ *
+ * @param      pi    The partition info for the current trial.
+ * @param      blk   The image block color data to be compressed.
+ * @param[out] pm    The output partition metrics.
+ *                   - Only pi.partition_count array entries actually get initialized.
+ *                   - Direction vectors @c pm.dir are not normalized.
+ */
+void compute_avgs_and_dirs_3_comp_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 4 component texture.
+ *
+ * @param      pi    The partition info for the current trial.
+ * @param      blk   The image block color data to be compressed.
+ * @param[out] pm    The output partition metrics.
+ *                   - Only pi.partition_count array entries actually get initialized.
+ *                   - Direction vectors @c pm.dir are not normalized.
+ */
+void compute_avgs_and_dirs_4_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief Compute the RGB error for uncorrelated and same chroma projections.
+ *
+ * The output of compute averages and dirs is post processed to define two lines, both of which go
+ * through the mean-color-value.  One line has a direction defined by the dominant direction; this
+ * is used to assess the error from using an uncorrelated color representation. The other line goes
+ * through (0,0,0) and is used to assess the error from using an RGBS color representation.
+ *
+ * This function computes the squared error when using these two representations.
+ *
+ * @param         pi            The partition info for the current trial.
+ * @param         blk           The image block color data to be compressed.
+ * @param[in,out] plines        Processed line inputs, and line length outputs.
+ * @param[out]    uncor_error   The cumulative error for using the uncorrelated line.
+ * @param[out]    samec_error   The cumulative error for using the same chroma line.
+ */
+void compute_error_squared_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error);
+
+/**
+ * @brief Compute the RGBA error for uncorrelated and same chroma projections.
+ *
+ * The output of compute averages and dirs is post processed to define two lines, both of which go
+ * through the mean-color-value.  One line has a direction defined by the dominant direction; this
+ * is used to assess the error from using an uncorrelated color representation. The other line goes
+ * through (0,0,0,1) and is used to assess the error from using an RGBS color representation.
+ *
+ * This function computes the squared error when using these two representations.
+ *
+ * @param      pi              The partition info for the current trial.
+ * @param      blk             The image block color data to be compressed.
+ * @param      uncor_plines    Processed uncorrelated partition lines for each partition.
+ * @param      samec_plines    Processed same chroma partition lines for each partition.
+ * @param[out] uncor_lengths   The length of each component's deviation from the line.
+ * @param[out] samec_lengths   The length of each component's deviation from the line.
+ * @param[out] uncor_error     The cumulative error for using the uncorrelated line.
+ * @param[out] samec_error     The cumulative error for using the same chroma line.
+ */
+void compute_error_squared_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
+	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
+	float uncor_lengths[BLOCK_MAX_PARTITIONS],
+	float samec_lengths[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error);
+
+/**
+ * @brief Find the best set of partitions to trial for a given block.
+ *
+ * On return the @c best_partitions list will contain the two best partition
+ * candidates; one assuming data has uncorrelated chroma and one assuming the
+ * data has correlated chroma. The best candidate is returned first in the list.
+ *
+ * @param      bsd                      The block size information.
+ * @param      blk                      The image block color data to compress.
+ * @param      partition_count          The number of partitions in the block.
+ * @param      partition_search_limit   The number of candidate partition encodings to trial.
+ * @param[out] best_partitions          The best partition candidates.
+ * @param      requested_candidates     The number of requested partitionings. May return fewer if
+ *                                      candidates are not available.
+ *
+ * @return The actual number of candidates returned.
+ */
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates);
+
+/* ============================================================================
+  Functionality for managing images and image related data.
+============================================================================ */
+
+/**
+ * @brief Setup computation of regional averages in an image.
+ *
+ * This must be done by only a single thread per image, before any thread calls
+ * @c compute_averages().
+ *
+ * Results are written back into @c img->input_alpha_averages.
+ *
+ * @param      img                   The input image data, also holds output data.
+ * @param      alpha_kernel_radius   The kernel radius (in pixels) for alpha mods.
+ * @param      swz                   Input data component swizzle.
+ * @param[out] ag                    The average variance arguments to init.
+ *
+ * @return The number of tasks in the processing stage.
+ */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag);
+
+/**
+ * @brief Compute averages for a pixel region.
+ *
+ * The routine computes both in a single pass, using a summed-area table to decouple the running
+ * time from the averaging/variance kernel size.
+ *
+ * @param[out] ctx   The compressor context storing the output data.
+ * @param      arg   The input parameter structure.
+ */
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg);
+/**
+ * @brief Load a single image block from the input image.
+ *
+ * @param      decode_mode   The compression color profile.
+ * @param      img           The input image data.
+ * @param[out] blk           The image block to populate.
+ * @param      bsd           The block size information.
+ * @param      xpos          The block X coordinate in the input image.
+ * @param      ypos          The block Y coordinate in the input image.
+ * @param      zpos          The block Z coordinate in the input image.
+ * @param      swz           The swizzle to apply on load.
+ */
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/**
+ * @brief Load a single image block from the input image.
+ *
+ * This specialized variant can be used only if the block is 2D LDR U8 data,
+ * with no swizzle.
+ *
+ * @param      decode_mode   The compression color profile.
+ * @param      img           The input image data.
+ * @param[out] blk           The image block to populate.
+ * @param      bsd           The block size information.
+ * @param      xpos          The block X coordinate in the input image.
+ * @param      ypos          The block Y coordinate in the input image.
+ * @param      zpos          The block Z coordinate in the input image.
+ * @param      swz           The swizzle to apply on load.
+ */
+void load_image_block_fast_ldr(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/**
+ * @brief Store a single image block to the output image.
+ *
+ * @param[out] img    The output image data.
+ * @param      blk    The image block to export.
+ * @param      bsd    The block size information.
+ * @param      xpos   The block X coordinate in the input image.
+ * @param      ypos   The block Y coordinate in the input image.
+ * @param      zpos   The block Z coordinate in the input image.
+ * @param      swz    The swizzle to apply on store.
+ */
+void store_image_block(
+	astcenc_image& img,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/* ============================================================================
+  Functionality for computing endpoint colors and weights for a block.
+============================================================================ */
+
+/**
+ * @brief Compute ideal endpoint colors and weights for 1 plane of weights.
+ *
+ * The ideal endpoints define a color line for the partition. For each texel the ideal weight
+ * defines an exact position on the partition color line. We can then use these to assess the error
+ * introduced by removing and quantizing the weight grid.
+ *
+ * @param      blk   The image block color data to compress.
+ * @param      pi    The partition info for the current trial.
+ * @param[out] ei    The endpoint and weight values.
+ */
+void compute_ideal_colors_and_weights_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei);
+
+/**
+ * @brief Compute ideal endpoint colors and weights for 2 planes of weights.
+ *
+ * The ideal endpoints define a color line for the partition. For each texel the ideal weight
+ * defines an exact position on the partition color line. We can then use these to assess the error
+ * introduced by removing and quantizing the weight grid.
+ *
+ * @param      bsd                The block size information.
+ * @param      blk                The image block color data to compress.
+ * @param      plane2_component   The component assigned to plane 2.
+ * @param[out] ei1                The endpoint and weight values for plane 1.
+ * @param[out] ei2                The endpoint and weight values for plane 2.
+ */
+void compute_ideal_colors_and_weights_2planes(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int plane2_component,
+	endpoints_and_weights& ei1,
+	endpoints_and_weights& ei2);
+
+/**
+ * @brief Compute the optimal unquantized weights for a decimation table.
+ *
+ * After computing ideal weights for the case of a complete weight grid, we want to compute the
+ * ideal weights for the case where weights exist only for some texels. We do this with a
+ * steepest-descent grid solver which works as follows:
+ *
+ * First, for each actual weight, perform a weighted averaging of the texels affected by the weight.
+ * Then, set step size to <some initial value> and attempt one step towards the original ideal
+ * weight if it helps to reduce error.
+ *
+ * @param      ei                       The non-decimated endpoints and weights.
+ * @param      di                       The selected weight decimation.
+ * @param[out] dec_weight_ideal_value   The ideal values for the decimated weight set.
+ */
+void compute_ideal_weights_for_decimation(
+	const endpoints_and_weights& ei,
+	const decimation_info& di,
+	float* dec_weight_ideal_value);
+
+/**
+ * @brief Compute the optimal quantized weights for a decimation table.
+ *
+ * We test the two closest weight indices in the allowed quantization range and keep the weight that
+ * is the closest match.
+ *
+ * @param      di                        The selected weight decimation.
+ * @param      low_bound                 The lowest weight allowed.
+ * @param      high_bound                The highest weight allowed.
+ * @param      dec_weight_ideal_value    The ideal weight set.
+ * @param[out] dec_weight_quant_uvalue   The output quantized weight as a float.
+ * @param[out] dec_weight_uquant         The output quantized weight as encoded int.
+ * @param      quant_level               The desired weight quant level.
+ */
+void compute_quantized_weights_for_decimation(
+	const decimation_info& di,
+	float low_bound,
+	float high_bound,
+	const float* dec_weight_ideal_value,
+	float* dec_weight_quant_uvalue,
+	uint8_t* dec_weight_uquant,
+	quant_method quant_level);
+
+/**
+ * @brief Compute the error of a decimated weight set for 1 plane.
+ *
+ * After computing ideal weights for the case with one weight per texel, we want to compute the
+ * error for decimated weight grids where weights are stored at a lower resolution. This function
+ * computes the error of the reduced grid, compared to the full grid.
+ *
+ * @param eai                       The ideal weights for the full grid.
+ * @param di                        The selected weight decimation.
+ * @param dec_weight_quant_uvalue   The quantized weights for the decimated grid.
+ *
+ * @return The accumulated error.
+ */
+float compute_error_of_weight_set_1plane(
+	const endpoints_and_weights& eai,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue);
+
+/**
+ * @brief Compute the error of a decimated weight set for 2 planes.
+ *
+ * After computing ideal weights for the case with one weight per texel, we want to compute the
+ * error for decimated weight grids where weights are stored at a lower resolution. This function
+ * computes the error of the reduced grid, compared to the full grid.
+ *
+ * @param eai1                             The ideal weights for the full grid and plane 1.
+ * @param eai2                             The ideal weights for the full grid and plane 2.
+ * @param di                               The selected weight decimation.
+ * @param dec_weight_quant_uvalue_plane1   The quantized weights for the decimated grid plane 1.
+ * @param dec_weight_quant_uvalue_plane2   The quantized weights for the decimated grid plane 2.
+ *
+ * @return The accumulated error.
+ */
+float compute_error_of_weight_set_2planes(
+	const endpoints_and_weights& eai1,
+	const endpoints_and_weights& eai2,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue_plane1,
+	const float* dec_weight_quant_uvalue_plane2);
+
+/**
+ * @brief Pack a single pair of color endpoints as effectively as possible.
+ *
+ * The user requests a base color endpoint mode in @c format, but the quantizer may choose a
+ * delta-based representation. It will report back the format variant it actually used.
+ *
+ * @param      color0        The input unquantized color0 endpoint for absolute endpoint pairs.
+ * @param      color1        The input unquantized color1 endpoint for absolute endpoint pairs.
+ * @param      rgbs_color    The input unquantized RGBS variant endpoint for same chroma endpoints.
+ * @param      rgbo_color    The input unquantized RGBO variant endpoint for HDR endpoints.
+ * @param      format        The desired base format.
+ * @param[out] output        The output storage for the quantized colors.
+ * @param      quant_level   The quantization level requested.
+ *
+ * @return The actual endpoint mode used.
+ */
+uint8_t pack_color_endpoints(
+	vfloat4 color0,
+	vfloat4 color1,
+	vfloat4 rgbs_color,
+	vfloat4 rgbo_color,
+	int format,
+	uint8_t* output,
+	quant_method quant_level);
+
+/**
+ * @brief Unpack a single pair of encoded endpoints.
+ *
+ * Endpoints must be unscrambled and converted into the 0-255 range before calling this function.
+ *
+ * @param      decode_mode   The decode mode (LDR, HDR).
+ * @param      format        The color endpoint mode used.
+ * @param      input         The raw array of encoded input integers. The length of this array
+ *                           depends on @c format; it can be safely assumed to be large enough.
+ * @param[out] rgb_hdr       Is the endpoint using HDR for the RGB channels?
+ * @param[out] alpha_hdr     Is the endpoint using HDR for the A channel?
+ * @param[out] output0       The output color for endpoint 0.
+ * @param[out] output1       The output color for endpoint 1.
+ */
+void unpack_color_endpoints(
+	astcenc_profile decode_mode,
+	int format,
+	const uint8_t* input,
+	bool& rgb_hdr,
+	bool& alpha_hdr,
+	vint4& output0,
+	vint4& output1);
+
+/**
+ * @brief Unpack a set of quantized and decimated weights.
+ *
+ * TODO: Can we skip this for non-decimated weights now that the @c scb is
+ * already storing unquantized weights?
+ *
+ * @param      bsd              The block size information.
+ * @param      scb              The symbolic compressed encoding.
+ * @param      di               The weight grid decimation table.
+ * @param      is_dual_plane    @c true if this is a dual plane block, @c false otherwise.
+ * @param[out] weights_plane1   The output array for storing the plane 1 weights.
+ * @param[out] weights_plane2   The output array for storing the plane 2 weights.
+ */
+void unpack_weights(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const decimation_info& di,
+	bool is_dual_plane,
+	int weights_plane1[BLOCK_MAX_TEXELS],
+	int weights_plane2[BLOCK_MAX_TEXELS]);
+
+/**
+ * @brief Identify, for each mode, which set of color endpoint produces the best result.
+ *
+ * Returns the best @c tune_candidate_limit best looking modes, along with the ideal color encoding
+ * combination for each. The modified quantization level can be used when all formats are the same,
+ * as this frees up two additional bits of storage.
+ *
+ * @param      pi                            The partition info for the current trial.
+ * @param      blk                           The image block color data to compress.
+ * @param      ep                            The ideal endpoints.
+ * @param      qwt_bitcounts                 Bit counts for different quantization methods.
+ * @param      qwt_errors                    Errors for different quantization methods.
+ * @param      tune_candidate_limit          The max number of candidates to return, may be less.
+ * @param      start_block_mode              The first block mode to inspect.
+ * @param      end_block_mode                The last block mode to inspect.
+ * @param[out] partition_format_specifiers   The best formats per partition.
+ * @param[out] block_mode                    The best packed block mode indexes.
+ * @param[out] quant_level                   The best color quant level.
+ * @param[out] quant_level_mod               The best color quant level if endpoints are the same.
+ * @param[out] tmpbuf                        Preallocated scratch buffers for the compressor.
+ *
+ * @return The actual number of candidate matches returned.
+ */
+unsigned int compute_ideal_endpoint_formats(
+	const partition_info& pi,
+	const image_block& blk,
+	const endpoints& ep,
+	const int8_t* qwt_bitcounts,
+	const float* qwt_errors,
+	unsigned int tune_candidate_limit,
+	unsigned int start_block_mode,
+	unsigned int end_block_mode,
+	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
+	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief For a given 1 plane weight set recompute the endpoint colors.
+ *
+ * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
+ * recompute the ideal colors for a specific weight set.
+ *
+ * @param         blk                  The image block color data to compress.
+ * @param         pi                   The partition info for the current trial.
+ * @param         di                   The weight grid decimation table.
+ * @param         dec_weights_uquant   The quantized weight set.
+ * @param[in,out] ep                   The color endpoints (modified in place).
+ * @param[out]    rgbs_vectors         The RGB+scale vectors for LDR blocks.
+ * @param[out]    rgbo_vectors         The RGB+offset vectors for HDR blocks.
+ */
+void recompute_ideal_colors_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant,
+	endpoints& ep,
+	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
+	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief For a given 2 plane weight set recompute the endpoint colors.
+ *
+ * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
+ * recompute the ideal colors for a specific weight set.
+ *
+ * @param         blk                         The image block color data to compress.
+ * @param         bsd                         The block_size descriptor.
+ * @param         di                          The weight grid decimation table.
+ * @param         dec_weights_uquant_plane1   The quantized weight set for plane 1.
+ * @param         dec_weights_uquant_plane2   The quantized weight set for plane 2.
+ * @param[in,out] ep                          The color endpoints (modified in place).
+ * @param[out]    rgbs_vector                 The RGB+scale color for LDR blocks.
+ * @param[out]    rgbo_vector                 The RGB+offset color for HDR blocks.
+ * @param         plane2_component            The component assigned to plane 2.
+ */
+void recompute_ideal_colors_2planes(
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant_plane1,
+	const uint8_t* dec_weights_uquant_plane2,
+	endpoints& ep,
+	vfloat4& rgbs_vector,
+	vfloat4& rgbo_vector,
+	int plane2_component);
+
+/**
+ * @brief Expand the angular tables needed for the alternative to PCA that we use.
+ */
+void prepare_angular_tables();
+
+/**
+ * @brief Compute the angular endpoints for one plane for each block mode.
+ *
+ * @param      only_always              Only consider block modes that are always enabled.
+ * @param      bsd                      The block size descriptor for the current trial.
+ * @param      dec_weight_ideal_value   The ideal decimated unquantized weight values.
+ * @param      max_weight_quant         The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf                   Preallocated scratch buffers for the compressor.
+ */
+void compute_angular_endpoints_1plane(
+	bool only_always,
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief Compute the angular endpoints for two planes for each block mode.
+ *
+ * @param      bsd                      The block size descriptor for the current trial.
+ * @param      dec_weight_ideal_value   The ideal decimated unquantized weight values.
+ * @param      max_weight_quant         The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf                   Preallocated scratch buffers for the compressor.
+ */
+void compute_angular_endpoints_2planes(
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf);
+
+/* ============================================================================
+  Functionality for high level compression and decompression access.
+============================================================================ */
+
+/**
+ * @brief Compress an image block into a physical block.
+ *
+ * @param      ctx      The compressor context and configuration.
+ * @param      blk      The image block color data to compress.
+ * @param[out] pcb      The physical compressed block output.
+ * @param[out] tmpbuf   Preallocated scratch buffers for the compressor.
+ */
+void compress_block(
+	const astcenc_contexti& ctx,
+	const image_block& blk,
+	physical_compressed_block& pcb,
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief Decompress a symbolic block in to an image block.
+ *
+ * @param      decode_mode   The decode mode (LDR, HDR, etc).
+ * @param      bsd           The block size information.
+ * @param      xpos          The X coordinate of the block in the overall image.
+ * @param      ypos          The Y coordinate of the block in the overall image.
+ * @param      zpos          The Z coordinate of the block in the overall image.
+ * @param[out] blk           The decompressed image block color data.
+ */
+void decompress_symbolic_block(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	int xpos,
+	int ypos,
+	int zpos,
+	const symbolic_compressed_block& scb,
+	image_block& blk);
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 2 plane and 1 partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config   The compressor config.
+ * @param bsd      The block size information.
+ * @param scb      The symbolic compressed encoding.
+ * @param blk      The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_2plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 1 plane and N partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config   The compressor config.
+ * @param bsd      The block size information.
+ * @param scb      The symbolic compressed encoding.
+ * @param blk      The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 1 plane and 1 partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config   The compressor config.
+ * @param bsd      The block size information.
+ * @param scb      The symbolic compressed encoding.
+ * @param blk      The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_1plane_1partition(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
+
+/**
+ * @brief Convert a symbolic representation into a binary physical encoding.
+ *
+ * It is assumed that the symbolic encoding is valid and encodable, or
+ * previously flagged as an error block if an error color is to be encoded.
+ *
+ * @param      bsd   The block size information.
+ * @param      scb   The symbolic representation.
+ * @param[out] pcb   The binary encoded data.
+ */
+void symbolic_to_physical(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	physical_compressed_block& pcb);
+
+/**
+ * @brief Convert a binary physical encoding into a symbolic representation.
+ *
+ * This function can cope with arbitrary input data; output blocks will be
+ * flagged as an error block if the encoding is invalid.
+ *
+ * @param      bsd   The block size information.
+ * @param      pcb   The binary encoded data.
+ * @param[out] scb   The output symbolic representation.
+ */
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const physical_compressed_block& pcb,
+	symbolic_compressed_block& scb);
+
+/* ============================================================================
+Platform-specific functions.
+============================================================================ */
+/**
+ * @brief Run-time detection if the host CPU supports the POPCNT extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_popcnt();
+
+/**
+ * @brief Run-time detection if the host CPU supports F16C extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_f16c();
+
+/**
+ * @brief Run-time detection if the host CPU supports SSE 4.1 extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_sse41();
+
+/**
+ * @brief Run-time detection if the host CPU supports AVX 2 extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_avx2();
+
+/**
+ * @brief Allocate an aligned memory buffer.
+ *
+ * Allocated memory must be freed by aligned_free().
+ *
+ * @param size    The desired buffer size.
+ * @param align   The desired buffer alignment; must be 2^N.
+ *
+ * @return The memory buffer pointer or nullptr on allocation failure.
+ */
+template<typename T>
+T* aligned_malloc(size_t size, size_t align)
+{
+	void* ptr;
+	int error = 0;
+
+#if defined(_WIN32)
+	ptr = _aligned_malloc(size, align);
+#else
+	error = posix_memalign(&ptr, align, size);
+#endif
+
+	if (error || (!ptr))
+	{
+		return nullptr;
+	}
+
+	return static_cast<T*>(ptr);
+}
+
+/**
+ * @brief Free an aligned memory buffer.
+ *
+ * @param ptr   The buffer to free.
+ */
+template<typename T>
+void aligned_free(T* ptr)
+{
+#if defined(_WIN32)
+	_aligned_free(reinterpret_cast<void*>(ptr));
+#else
+	free(reinterpret_cast<void*>(ptr));
+#endif
+}
+
+#endif

+ 273 - 0
thirdparty/astcenc/astcenc_internal_entry.h

@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations for the outer context.
+ *
+ * The outer context includes thread-pool management, which is slower to
+ * compile due to increased use of C++ stdlib. The inner context used in the
+ * majority of the codec library does not include this.
+ */
+
+#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
+#define ASTCENC_INTERNAL_ENTRY_INCLUDED
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "astcenc_internal.h"
+
+/* ============================================================================
+  Parallel execution control
+============================================================================ */
+
+/**
+ * @brief A simple counter-based manager for parallel task execution.
+ *
+ * The task processing execution consists of:
+ *
+ *     * A single-threaded init stage.
+ *     * A multi-threaded processing stage.
+ *     * A condition variable so threads can wait for processing completion.
+ *
+ * The init stage will be executed by the first thread to arrive in the critical section, there is
+ * no main thread in the thread pool.
+ *
+ * The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
+ * basis. Threads may therefore each execute different numbers of tasks, depending on their
+ * processing complexity. The task queue and the task tickets are just counters; the caller must map
+ * these integers to an actual processing partition in a specific problem domain.
+ *
+ * The exit wait condition is needed to ensure processing has finished before a worker thread can
+ * progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
+ * because there are no new tasks to assign to it while other worker threads are still processing.
+ * Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
+ *
+ * The basic usage model:
+ *
+ *     // --------- From single-threaded code ---------
+ *
+ *     // Reset the tracker state
+ *     manager->reset()
+ *
+ *     // --------- From multi-threaded code ---------
+ *
+ *     // Run the stage init; only first thread actually runs the lambda
+ *     manager->init(<lambda>)
+ *
+ *     do
+ *     {
+ *         // Request a task assignment
+ *         uint task_count;
+ *         uint base_index = manager->get_tasks(<granule>, task_count);
+ *
+ *         // Process any tasks we were given (task_count <= granule size)
+ *         if (task_count)
+ *         {
+ *             // Run the user task processing code for N tasks here
+ *             ...
+ *
+ *             // Flag these tasks as complete
+ *             manager->complete_tasks(task_count);
+ *         }
+ *     } while (task_count);
+ *
+ *     // Wait for all threads to complete tasks before progressing
+ *     manager->wait()
+ *
+ *     // Run the stage term; only first thread actually runs the lambda
+ *     manager->term(<lambda>)
+ */
+class ParallelManager
+{
+private:
+	/** @brief Lock used for critical section and condition synchronization. */
+	std::mutex m_lock;
+
+	/** @brief True if the stage init() step has been executed. */
+	bool m_init_done;
+
+	/** @brief True if the stage term() step has been executed. */
+	bool m_term_done;
+
+	/** @brief Condition variable for tracking stage processing completion. */
+	std::condition_variable m_complete;
+
+	/** @brief Number of tasks started, but not necessarily finished. */
+	std::atomic<unsigned int> m_start_count;
+
+	/** @brief Number of tasks finished. */
+	unsigned int m_done_count;
+
+	/** @brief Number of tasks that need to be processed. */
+	unsigned int m_task_count;
+
+public:
+	/** @brief Create a new ParallelManager. */
+	ParallelManager()
+	{
+		reset();
+	}
+
+	/**
+	 * @brief Reset the tracker for a new processing batch.
+	 *
+	 * This must be called from single-threaded code before starting the multi-threaded processing
+	 * operations.
+	 */
+	void reset()
+	{
+		m_init_done = false;
+		m_term_done = false;
+		m_start_count = 0;
+		m_done_count = 0;
+		m_task_count = 0;
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param init_func   Callable which executes the stage initialization. It must return the
+	 *                    total number of tasks in the stage.
+	 */
+	void init(std::function<unsigned int(void)> init_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = init_func();
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param task_count   Total number of tasks needing processing.
+	 */
+	void init(unsigned int task_count)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = task_count;
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Request a task assignment.
+	 *
+	 * Assign up to @c granule tasks to the caller for processing.
+	 *
+	 * @param      granule   Maximum number of tasks that can be assigned.
+	 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned.
+	 *
+	 * @return Task index of the first assigned task; assigned tasks increment from this.
+	 */
+	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
+	{
+		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
+		if (base >= m_task_count)
+		{
+			count = 0;
+			return 0;
+		}
+
+		count = astc::min(m_task_count - base, granule);
+		return base;
+	}
+
+	/**
+	 * @brief Complete a task assignment.
+	 *
+	 * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
+	 * completes the processing of the stage.
+	 *
+	 * @param count   The number of completed tasks.
+	 */
+	void complete_task_assignment(unsigned int count)
+	{
+		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
+		// update here and the wait() for other threads
+		std::unique_lock<std::mutex> lck(m_lock);
+		this->m_done_count += count;
+		if (m_done_count == m_task_count)
+		{
+			lck.unlock();
+			m_complete.notify_all();
+		}
+	}
+
+	/**
+	 * @brief Wait for stage processing to complete.
+	 */
+	void wait()
+	{
+		std::unique_lock<std::mutex> lck(m_lock);
+		m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage term step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * work pool termination. Caller must have called @c wait() prior to calling this function to
+	 * ensure that processing is complete.
+	 *
+	 * @param term_func   Callable which executes the stage termination.
+	 */
+	void term(std::function<void(void)> term_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_term_done)
+		{
+			term_func();
+			m_term_done = true;
+		}
+	}
+};
+
+/**
+ * @brief The astcenc compression context.
+ */
+struct astcenc_context
+{
+	/** @brief The context internal state. */
+	astcenc_contexti context;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	/** @brief The parallel manager for averages computation. */
+	ParallelManager manage_avg;
+
+	/** @brief The parallel manager for compression. */
+	ParallelManager manage_compress;
+#endif
+
+	/** @brief The parallel manager for decompression. */
+	ParallelManager manage_decompress;
+};
+
+#endif

+ 48 - 0
thirdparty/astcenc/astcenc_mathlib.cpp

@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include "astcenc_mathlib.h"
+
+/**
+ * @brief 64-bit rotate left.
+ *
+ * @param val   The value to rotate.
+ * @param count The rotation, in bits.
+ */
+static inline uint64_t rotl(uint64_t val, int count)
+{
+	return (val << count) | (val >> (64 - count));
+}
+
+/* See header for documentation. */
+void astc::rand_init(uint64_t state[2])
+{
+	state[0] = 0xfaf9e171cea1ec6bULL;
+	state[1] = 0xf1b318cc06af5d71ULL;
+}
+
+/* See header for documentation. */
+uint64_t astc::rand(uint64_t state[2])
+{
+	uint64_t s0 = state[0];
+	uint64_t s1 = state[1];
+	uint64_t res = s0 + s1;
+	s1 ^= s0;
+	state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
+	state[1] = rotl(s1, 37);
+	return res;
+}

+ 478 - 0
thirdparty/astcenc/astcenc_mathlib.h

@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements a variety of mathematical data types and library
+ * functions used by the codec.
+ */
+
+#ifndef ASTC_MATHLIB_H_INCLUDED
+#define ASTC_MATHLIB_H_INCLUDED
+
+#include <cassert>
+#include <cstdint>
+#include <cmath>
+
// ISA feature defaults: each ASTCENC_* macro may be pre-set by the build
// system; otherwise it is derived from the compiler's predefined macros.

// x86 POPCNT instruction support.
#ifndef ASTCENC_POPCNT
  #if defined(__POPCNT__)
    #define ASTCENC_POPCNT 1
  #else
    #define ASTCENC_POPCNT 0
  #endif
#endif

// x86 F16C hardware FP16 <-> FP32 conversion support.
#ifndef ASTCENC_F16C
  #if defined(__F16C__)
    #define ASTCENC_F16C 1
  #else
    #define ASTCENC_F16C 0
  #endif
#endif

// SSE level, encoded as a two-digit version (e.g. 41 = SSE4.1, 20 = SSE2).
#ifndef ASTCENC_SSE
  #if defined(__SSE4_2__)
    #define ASTCENC_SSE 42
  #elif defined(__SSE4_1__)
    #define ASTCENC_SSE 41
  #elif defined(__SSE3__)
    #define ASTCENC_SSE 30
  #elif defined(__SSE2__)
    #define ASTCENC_SSE 20
  #else
    #define ASTCENC_SSE 0
  #endif
#endif

// AVX level (2 = AVX2, 1 = AVX, 0 = none).
#ifndef ASTCENC_AVX
  #if defined(__AVX2__)
    #define ASTCENC_AVX 2
  #elif defined(__AVX__)
    #define ASTCENC_AVX 1
  #else
    #define ASTCENC_AVX 0
  #endif
#endif

// NEON support (assumed present on all AArch64 targets).
#ifndef ASTCENC_NEON
  #if defined(__aarch64__)
    #define ASTCENC_NEON 1
  #else
    #define ASTCENC_NEON 0
  #endif
#endif

// Required alignment for vector buffers: 32 bytes for 256-bit AVX
// registers, otherwise 16 bytes for 128-bit SSE/NEON registers.
#if ASTCENC_AVX
  #define ASTCENC_VECALIGN 32
#else
  #define ASTCENC_VECALIGN 16
#endif

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
	#include <immintrin.h>
#endif
+
+/* ============================================================================
+  Fast math library; note that many of the higher-order functions in this set
+  use approximations which are less accurate, but faster, than <cmath> standard
+  library equivalents.
+
+  Note: Many of these are not necessarily faster than simple C versions when
+  used on a single scalar value, but are included for testing purposes as most
+  have an option based on SSE intrinsics and therefore provide an obvious route
+  to future vectorization.
+============================================================================ */
+
// Union for manipulation of float bit patterns.
// NOTE: type punning through a union is formally implementation-defined in
// C++, but is documented as supported by GCC, Clang and MSVC.
typedef union
{
	uint32_t u;
	int32_t s;
	float f;
} if32;
+
// These are namespaced to avoid colliding with C standard library functions.
namespace astc
{

static const float PI          = 3.14159265358979323846f;
static const float PI_OVER_TWO = 1.57079632679489661923f;

/**
 * @brief SP float absolute value.
 *
 * @param v   The value to make absolute.
 *
 * @return The absolute value.
 */
static inline float fabs(float v)
{
	return std::fabs(v);
}

/**
 * @brief Test if a float value is a nan.
 *
 * NOTE: This relies on NaN != NaN self-comparison, which compiler fast-math
 * modes may optimize away; do not compile this code with such modes enabled.
 *
 * @param v    The value test.
 *
 * @return @c false if the value is not a NaN, @c true otherwise.
 */
static inline bool isnan(float v)
{
	return v != v;
}

/**
 * @brief Return the minimum of two values.
 *
 * For floats, NaNs are turned into @c q.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 *
 * @return The smallest value.
 */
template<typename T>
static inline T min(T p, T q)
{
	return p < q ? p : q;
}

/**
 * @brief Return the minimum of three values.
 *
 * For floats, NaNs are turned into @c r.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 *
 * @return The smallest value.
 */
template<typename T>
static inline T min(T p, T q, T r)
{
	return min(min(p, q), r);
}

/**
 * @brief Return the minimum of four values.
 *
 * For floats, NaNs are turned into @c s.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 * @param s   The fourth value to compare.
 *
 * @return The smallest value.
 */
template<typename T>
static inline T min(T p, T q, T r, T s)
{
	return min(min(p, q), min(r, s));
}

/**
 * @brief Return the maximum of two values.
 *
 * For floats, NaNs are turned into @c q.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 *
 * @return The largest value.
 */
template<typename T>
static inline T max(T p, T q)
{
	return p > q ? p : q;
}

/**
 * @brief Return the maximum of three values.
 *
 * For floats, NaNs are turned into @c r.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 *
 * @return The largest value.
 */
template<typename T>
static inline T max(T p, T q, T r)
{
	return max(max(p, q), r);
}

/**
 * @brief Return the maximum of four values.
 *
 * For floats, NaNs are turned into @c s.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 * @param s   The fourth value to compare.
 *
 * @return The largest value.
 */
template<typename T>
static inline T max(T p, T q, T r, T s)
{
	return max(max(p, q), max(r, s));
}

/**
 * @brief Clamp a value between @c mn and @c mx.
 *
 * For floats, NaNs are turned into @c mn.
 *
 * @param v      The value to clamp.
 * @param mn     The min value (inclusive).
 * @param mx     The max value (inclusive).
 *
 * @return The clamped value.
 */
template<typename T>
inline T clamp(T v, T mn, T mx)
{
	// Do not reorder; correct NaN handling relies on the fact that comparison
	// with NaN returns false and will fall-through to the "min" value.
	if (v > mx) return mx;
	if (v > mn) return v;
	return mn;
}

/**
 * @brief Clamp a float value between 0.0f and 1.0f.
 *
 * NaNs are turned into 0.0f.
 *
 * @param v   The value to clamp.
 *
 * @return The clamped value.
 */
static inline float clamp1f(float v)
{
	return astc::clamp(v, 0.0f, 1.0f);
}

/**
 * @brief Clamp a float value between 0.0f and 255.0f.
 *
 * NaNs are turned into 0.0f.
 *
 * @param v  The value to clamp.
 *
 * @return The clamped value.
 */
static inline float clamp255f(float v)
{
	return astc::clamp(v, 0.0f, 255.0f);
}

/**
 * @brief SP float round-down.
 *
 * @param v   The value to round.
 *
 * @return The rounded value.
 */
static inline float flt_rd(float v)
{
	return std::floor(v);
}

/**
 * @brief SP float round-to-nearest and convert to integer.
 *
 * NOTE(review): implemented as truncate(v + 0.5f), which rounds half away
 * from zero for non-negative inputs but mis-rounds negative inputs;
 * presumably only called with non-negative values - confirm at call sites.
 *
 * @param v   The value to round.
 *
 * @return The rounded value.
 */
static inline int flt2int_rtn(float v)
{

	return static_cast<int>(v + 0.5f);
}

/**
 * @brief SP float round down and convert to integer.
 *
 * @param v   The value to round.
 *
 * @return The rounded value.
 */
static inline int flt2int_rd(float v)
{
	return static_cast<int>(v);
}

/**
 * @brief SP float bit-interpreted as an integer.
 *
 * NOTE: union type punning is implementation-defined in C++, but is
 * documented as supported by GCC, Clang and MSVC.
 *
 * @param v   The value to bitcast.
 *
 * @return The converted value.
 */
static inline int float_as_int(float v)
{
	union { int a; float b; } u;
	u.b = v;
	return u.a;
}

/**
 * @brief Integer bit-interpreted as an SP float.
 *
 * NOTE: union type punning is implementation-defined in C++, but is
 * documented as supported by GCC, Clang and MSVC.
 *
 * @param v   The value to bitcast.
 *
 * @return The converted value.
 */
static inline float int_as_float(int v)
{
	union { int a; float b; } u;
	u.a = v;
	return u.b;
}

/**
 * @brief Fast approximation of 1.0 / sqrt(val).
 *
 * Despite the name, this scalar implementation is currently the exact
 * library computation rather than an approximation.
 *
 * @param v   The input value.
 *
 * @return The approximated result.
 */
static inline float rsqrt(float v)
{
	return 1.0f / std::sqrt(v);
}

/**
 * @brief Fast approximation of sqrt(val).
 *
 * Despite the name, this scalar implementation is currently the exact
 * library computation rather than an approximation.
 *
 * @param v   The input value.
 *
 * @return The approximated result.
 */
static inline float sqrt(float v)
{
	return std::sqrt(v);
}

/**
 * @brief Extract mantissa and exponent of a float value.
 *
 * NOTE: unlike std::frexp this bit-twiddled version has no special cases
 * for zeros, denormals, infinities or NaNs; it assumes a normal, nonzero
 * input value.
 *
 * @param      v      The input value.
 * @param[out] expo   The output exponent.
 *
 * @return The mantissa.
 */
static inline float frexp(float v, int* expo)
{
	if32 p;
	p.f = v;
	*expo = ((p.u >> 23) & 0xFF) - 126;
	p.u = (p.u & 0x807fffff) | 0x3f000000;
	return p.f;
}

/**
 * @brief Initialize the seed structure for a random number generator.
 *
 * Important note: For the purposes of ASTC we want sets of random numbers to
 * use the codec, but we want the same seed value across instances and threads
 * to ensure that image output is stable across compressor runs and across
 * platforms. Every PRNG created by this call will therefore return the same
 * sequence of values ...
 *
 * @param state The state structure to initialize.
 */
void rand_init(uint64_t state[2]);

/**
 * @brief Return the next random number from the generator.
 *
 * This RNG is an implementation of the "xoroshoro-128+ 1.0" PRNG, based on the
 * public-domain implementation given by David Blackman & Sebastiano Vigna at
 * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
 *
 * @param state The state structure to use/update.
 */
uint64_t rand(uint64_t state[2]);

}
+
/* ============================================================================
  Softfloat library with fp32 and fp16 conversion functionality.
============================================================================ */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
	/* Narrowing float <-> FP16 conversions, implemented in
	   astcenc_mathlib_softfloat.cpp; only needed when there is no hardware
	   FP16 conversion support (no F16C and no NEON). */
	uint16_t float_to_sf16(float val);
	float sf16_to_float(uint16_t val);
#endif
+
+/*********************************
+  Vector library
+*********************************/
+#include "astcenc_vecmathlib.h"
+
/*********************************
  Declaration of line types
*********************************/
// parametric line, 2D: The line is given by line = a + b * t.

struct line2
{
	vfloat4 a;    // Point on the line (position at t = 0).
	vfloat4 b;    // Line direction vector.
};

// parametric line, 3D: line = a + b * t.
struct line3
{
	vfloat4 a;    // Point on the line (position at t = 0).
	vfloat4 b;    // Line direction vector.
};

// parametric line, 4D: line = a + b * t.
struct line4
{
	vfloat4 a;    // Point on the line (position at t = 0).
	vfloat4 b;    // Line direction vector.
};

// Preprocessed forms of the parametric lines above, as consumed by the
// encoder. NOTE(review): amod/bs appear to be a precomputed offset and a
// scaled direction - confirm against the code that builds these structures.
struct processed_line2
{
	vfloat4 amod;
	vfloat4 bs;
};

struct processed_line3
{
	vfloat4 amod;
	vfloat4 bs;
};

struct processed_line4
{
	vfloat4 amod;
	vfloat4 bs;
};
+#endif

+ 411 - 0
thirdparty/astcenc/astcenc_mathlib_softfloat.cpp

@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Soft-float library for IEEE-754.
+ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+
+#include "astcenc_mathlib.h"
+
/*	sized soft-float types. These are mapped to the sized integer
    types of C99, instead of C's floating-point types; this is because
    the library needs to maintain exact, bit-level control on all
    operations on these data types. */
typedef uint16_t sf16;
typedef uint32_t sf32;

/******************************************
  helper functions and their lookup tables
 ******************************************/
/* count leading zeros functions. Only used when the input is nonzero. */

/* The first three branches are intentionally empty: on those targets clz32()
   below uses an inline-assembly or builtin CLZ, so no table is needed. */
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
#elif defined(__arm__) && defined(__ARMCC_VERSION)
#elif defined(__arm__) && defined(__GNUC__)
#else
	/* table used for the slow default versions: clz_table[i] is the number of
	   leading zeros in the 8-bit value i. */
	static const uint8_t clz_table[256] =
	{
		8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
#endif
+
/*
   32-bit count-leading-zeros function: use the Assembly instruction whenever possible.
   Note: on the x86 path the input is OR'd with 1 so that an input of zero
   returns 31 instead of hitting BSR's undefined zero-input case. */
static uint32_t clz32(uint32_t inp)
{
	#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
		uint32_t bsr;
		__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
		return 31 - bsr;
	#else
		#if defined(__arm__) && defined(__ARMCC_VERSION)
			return __clz(inp);			/* armcc builtin */
		#else
			#if defined(__arm__) && defined(__GNUC__)
				uint32_t lz;
				__asm__("clz %0, %1": "=r"(lz):"r"(inp));
				return lz;
			#else
				/* slow default version: reduce to an 8-bit value, then use
				   the clz_table lookup for the final bits. */
				uint32_t summa = 24;
				if (inp >= UINT32_C(0x10000))
				{
					inp >>= 16;
					summa -= 16;
				}
				if (inp >= UINT32_C(0x100))
				{
					inp >>= 8;
					summa -= 8;
				}
				return summa + clz_table[inp];
			#endif
		#endif
	#endif
}
+
/* the five rounding modes that IEEE-754r defines; the numeric values are
   used as offsets into the case tables of sf32_to_sf16() below. */
typedef enum
{
	SF_UP = 0,				/* round towards positive infinity */
	SF_DOWN = 1,			/* round towards negative infinity */
	SF_TOZERO = 2,			/* round towards zero */
	SF_NEARESTEVEN = 3,		/* round toward nearest value; if mid-between, round to even value */
	SF_NEARESTAWAY = 4		/* round toward nearest value; if mid-between, round away from zero */
} roundmode;
+
+
/* Right-shift of 'inp' by 'shamt' bits, with round-to-nearest-even. */
static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
	uint32_t half_ulp = (UINT32_C(1) << shamt) >> 1;
	/* Bit that decides odd/even; OR with 1 forces the odd path when shamt
	   is 0 so that no correction is applied. */
	uint32_t odd_bit = (inp | UINT32_C(1)) & (UINT32_C(1) << shamt);
	uint32_t biased = inp + half_ulp;	/* add 0.5 ULP */
	odd_bit--;				/* MSB set if even, clear if odd */
	biased -= odd_bit >> 31;		/* back off by epsilon when even */
	return biased >> shamt;
}
+
/* Right-shift of 'inp' by 'shamt' bits, rounding to nearest, ties away
   from zero. */
static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
	/* Adding half a ULP makes plain truncation round to nearest-away. */
	uint32_t bias = (UINT32_C(1) << shamt) >> 1;
	return (inp + bias) >> shamt;
}
+
/* Right-shift of 'inp' by 'shamt' bits, rounding up (towards +infinity). */
static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
	/* Adding (2^shamt - 1) makes any nonzero remainder bump the result. */
	uint32_t bias = (UINT32_C(1) << shamt) - 1;
	return (inp + bias) >> shamt;
}
+
/**
 * @brief Convert from FP16 to FP32, handling denormals and quietening NaNs.
 *
 * @param inp   The FP16 value, as a raw bit pattern.
 *
 * @return The FP32 bit pattern of the equivalent value.
 */
static sf32 sf16_to_sf32(sf16 inp)
{
	uint32_t inpx = inp;

	/*
		This table contains, for every FP16 sign/exponent value combination,
		the difference between the input FP16 value and the value obtained
		by shifting the correct FP32 result right by 13 bits.
		This table allows us to handle every case except denormals and NaN
		with just 1 table lookup, 2 shifts and 1 add.
	*/

	#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
	static const uint32_t tbl[64] =
	{
		WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
		WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
	};

	/* MSB of the table entry flags the exceptional cases (zero exponent or
	   max exponent) which need the extra handling below. */
	uint32_t res = tbl[inpx >> 10];
	res += inpx;

	/* Normal cases: MSB of 'res' not set. */
	if ((res & WITH_MSB(0)) == 0)
	{
		return res << 13;
	}

	/* Infinity and Zero: 10 LSB of 'res' not set. */
	if ((res & 0x3FF) == 0)
	{
		return res << 13;
	}

	/* NaN: the exponent field of 'inp' is non-zero. */
	if ((inpx & 0x7C00) != 0)
	{
		/* All NaNs are quietened. */
		return (res << 13) | 0x400000;
	}

	/* Denormal cases: renormalize the mantissa and rebuild the exponent. */
	uint32_t sign = (inpx & 0x8000) << 16;
	uint32_t mskval = inpx & 0x7FFF;
	uint32_t leadingzeroes = clz32(mskval);
	mskval <<= leadingzeroes;
	return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
}
+
/**
 * @brief Convert from FP32 to FP16.
 *
 * Supports denormals and all five rounding modes. If a NaN is given as
 * input, it is quietened.
 *
 * @param inp     The FP32 value, as a raw bit pattern.
 * @param rmode   The rounding mode to apply.
 *
 * @return The FP16 bit pattern of the converted value.
 */
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
{
	/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
	static const uint8_t tab[512] {
		0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
		10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
		20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
		30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,

		5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
		25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
		35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
	};

	/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
	   size. */
	static const uint32_t tabx[60] {
		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
		UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
	};

	uint32_t p;
	/* Case index = per-mode offset (0..4 from 'rmode') + case base from the
	   sign/exponent lookup; each handler below covers one base. */
	uint32_t idx = rmode + tab[inp >> 23];
	uint32_t vlx = tabx[idx];
	switch (idx)
	{
		/*
			Positive number which may be Infinity or NaN.
			We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
			(If we don't do this quieting, then a NaN  that is distinguished only by having
			its low-order bits set, would be turned into an INF. */
	case 50:
	case 51:
	case 52:
	case 53:
	case 54:
	case 55:
	case 56:
	case 57:
	case 58:
	case 59:
		/*
			the input value is 0x7F800000 or 0xFF800000 if it is INF.
			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
			For NaNs, however, this operation will keep bit 23 with the value 1.
			We can then extract bit 23, and logical-OR bit 9 of the result with this
			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
			of the mantissa is set.)
		*/
		p = (inp - 1) & UINT32_C(0x800000);	/* zero if INF, nonzero if NaN. */
		return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
		/*
			positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
			If it is, then return 0, else return 1 (the smallest representable nonzero number)
		*/
	case 0:
		/*
			-inp will set the MSB if the input number is nonzero.
			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
		*/
		return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);

		/*
			negative, exponent = , round-mode == DOWN, need to check whether number is
			actually 0. If it is, return 0x8000 ( float -0.0 )
			Else return the smallest negative number ( 0x8001 ) */
	case 6:
		/*
			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
			we obtain a value that is 0 if the input value is in fact zero and has
			the MSB set if it isn't. We then right-shift the value by 31 places to
			get a value that is 0 if the input is -0.0 and 1 otherwise.
		*/
		return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));

		/*
			for all other cases involving underflow/overflow, we don't need to
			do actual tests; we just return 'vlx'.
		*/
	case 1:
	case 2:
	case 3:
	case 4:
	case 5:
	case 7:
	case 8:
	case 9:
	case 10:
	case 11:
	case 12:
	case 13:
	case 14:
	case 15:
	case 16:
	case 17:
	case 18:
	case 19:
	case 40:
	case 41:
	case 42:
	case 43:
	case 44:
	case 45:
	case 46:
	case 47:
	case 48:
	case 49:
		return static_cast<sf16>(vlx);

		/*
			for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
			FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
			baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
			from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
			for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
			except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */

		/* normal number, all rounding modes except round-to-nearest-even: */
	case 30:
	case 31:
	case 32:
	case 34:
	case 35:
	case 36:
	case 37:
	case 39:
		return static_cast<sf16>((inp + vlx) >> 13);

		/* normal number, round-to-nearest-even. */
	case 33:
	case 38:
		p = inp + vlx;
		p += (inp >> 13) & 1;
		return static_cast<sf16>(p >> 13);

		/*
			the various denormal cases. These are not expected to be common, so their performance is a bit
			less important. For each of these cases, we need to extract an exponent and a mantissa
			(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
			depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
			sign of the resulting denormal number.
		*/
	case 21:
	case 22:
	case 25:
	case 27:
		/* denormal, round towards zero. */
		p = 126 - ((inp >> 23) & 0xFF);
		return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
	case 20:
	case 26:
		/* denormal, round away from zero. */
		p = 126 - ((inp >> 23) & 0xFF);
		return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
	case 24:
	case 29:
		/* denormal, round to nearest-away */
		p = 126 - ((inp >> 23) & 0xFF);
		return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
	case 23:
	case 28:
		/* denormal, round to nearest-even. */
		p = 126 - ((inp >> 23) & 0xFF);
		return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
	}

	return 0;
}
+
+/* convert from soft-float to native-float */
+float sf16_to_float(uint16_t p)
+{
+	if32 i;
+	i.u = sf16_to_sf32(p);
+	return i.f;
+}
+
+/* convert from native-float to soft-float */
+uint16_t float_to_sf16(float p)
+{
+	if32 i;
+	i.f = p;
+	return sf32_to_sf16(i.u, SF_NEARESTEVEN);
+}
+
+#endif

+ 481 - 0
thirdparty/astcenc/astcenc_partition_tables.cpp

@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for generating partition tables on demand.
+ */
+
+#include "astcenc_internal.h"
+
+/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
+#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
+
+/**
+ * @brief Generate a canonical representation of a partition pattern.
+ *
+ * The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
+ * the remapped texel index. Remapping ensures that we only match on the partition pattern,
+ * independent of the partition order generated by the hash.
+ *
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_of_texel   The partition assignments, in hash order.
+ * @param[out] bit_pattern          The output bit pattern representation.
+ */
+static void generate_canonical_partitioning(
+	unsigned int texel_count,
+	const uint8_t* partition_of_texel,
+	uint64_t bit_pattern[BIT_PATTERN_WORDS]
+) {
+	// Clear the pattern
+	for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
+	{
+		bit_pattern[i] = 0;
+	}
+
+	// Store a mapping to reorder the raw partitions so that the partitions are ordered such
+	// that the lowest texel index in partition N is smaller than the lowest texel index in
+	// partition N + 1.
+	int mapped_index[BLOCK_MAX_PARTITIONS];
+	int map_weight_count = 0;
+
+	for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
+	{
+		mapped_index[i] = -1;
+	}
+
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		int index = partition_of_texel[i];
+		if (mapped_index[index] < 0)
+		{
+			mapped_index[index] = map_weight_count++;
+		}
+
+		uint64_t xlat_index = mapped_index[index];
+		bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
+	}
+}
+
/**
 * @brief Compare two canonical patterns to see if they are the same.
 *
 * Only as many words as BIT_PATTERN_WORDS provides are compared; the chain
 * of preprocessor blocks unrolls the comparison at compile time for the
 * configured maximum block size.
 *
 * @param part1   The first canonical bit pattern to check.
 * @param part2   The second canonical bit pattern to check.
 *
 * @return @c true if the patterns are the same, @c false otherwise.
 */
static bool compare_canonical_partitionings(
	const uint64_t part1[BIT_PATTERN_WORDS],
	const uint64_t part2[BIT_PATTERN_WORDS]
) {
	return (part1[0] == part2[0])
#if BIT_PATTERN_WORDS > 1
	    && (part1[1] == part2[1])
#endif
#if BIT_PATTERN_WORDS > 2
	    && (part1[2] == part2[2])
#endif
#if BIT_PATTERN_WORDS > 3
	    && (part1[3] == part2[3])
#endif
#if BIT_PATTERN_WORDS > 4
	    && (part1[4] == part2[4])
#endif
#if BIT_PATTERN_WORDS > 5
	    && (part1[5] == part2[5])
#endif
#if BIT_PATTERN_WORDS > 6
	    && (part1[6] == part2[6])
#endif
	    ;
}
+
/**
 * @brief Hash function used for procedural partition assignment.
 *
 * This is the fixed mixing function required by the ASTC specification;
 * the constants and operation order must not be changed.
 *
 * @param inp   The hash seed.
 *
 * @return The hashed value.
 */
static uint32_t hash52(
	uint32_t inp
) {
	uint32_t v = inp;

	v ^= v >> 15;

	// (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
	v *= 0xEEDE0891;

	v ^= v >> 5;
	v += v << 16;
	v ^= v >> 7;
	v ^= v >> 3;
	v ^= v << 6;
	v ^= v >> 17;

	return v;
}
+
/**
 * @brief Select texel assignment for a single coordinate.
 *
 * The returned index is always in [0, partition_count - 1]; components for
 * partitions beyond the requested count are zeroed before the comparison,
 * and ties favor the lower partition index.
 *
 * @param seed              The seed - the partition index from the block.
 * @param x                 The texel X coordinate in the block.
 * @param y                 The texel Y coordinate in the block.
 * @param z                 The texel Z coordinate in the block.
 * @param partition_count   The total partition count of this encoding.
 * @param small_block       @c true if the block has fewer than 32 texels.
 *
 * @return The assigned partition index for this texel.
 */
static uint8_t select_partition(
	int seed,
	int x,
	int y,
	int z,
	int partition_count,
	bool small_block
) {
	// For small blocks bias the coordinates to get better distribution
	if (small_block)
	{
		x <<= 1;
		y <<= 1;
		z <<= 1;
	}

	seed += (partition_count - 1) * 1024;

	uint32_t rnum = hash52(seed);

	// Slice the hash into twelve overlapping 4-bit seeds
	uint8_t seed1 = rnum & 0xF;
	uint8_t seed2 = (rnum >> 4) & 0xF;
	uint8_t seed3 = (rnum >> 8) & 0xF;
	uint8_t seed4 = (rnum >> 12) & 0xF;
	uint8_t seed5 = (rnum >> 16) & 0xF;
	uint8_t seed6 = (rnum >> 20) & 0xF;
	uint8_t seed7 = (rnum >> 24) & 0xF;
	uint8_t seed8 = (rnum >> 28) & 0xF;
	uint8_t seed9 = (rnum >> 18) & 0xF;
	uint8_t seed10 = (rnum >> 22) & 0xF;
	uint8_t seed11 = (rnum >> 26) & 0xF;
	uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;

	// Squaring all the seeds in order to bias their distribution towards lower values.
	// (Max input is 15, so 225 still fits in a uint8_t.)
	seed1 *= seed1;
	seed2 *= seed2;
	seed3 *= seed3;
	seed4 *= seed4;
	seed5 *= seed5;
	seed6 *= seed6;
	seed7 *= seed7;
	seed8 *= seed8;
	seed9 *= seed9;
	seed10 *= seed10;
	seed11 *= seed11;
	seed12 *= seed12;

	// Seed-dependent shift amounts spread the gradient frequencies
	int sh1, sh2;
	if (seed & 1)
	{
		sh1 = (seed & 2 ? 4 : 5);
		sh2 = (partition_count == 3 ? 6 : 5);
	}
	else
	{
		sh1 = (partition_count == 3 ? 6 : 5);
		sh2 = (seed & 2 ? 4 : 5);
	}

	int sh3 = (seed & 0x10) ? sh1 : sh2;

	seed1 >>= sh1;
	seed2 >>= sh2;
	seed3 >>= sh1;
	seed4 >>= sh2;
	seed5 >>= sh1;
	seed6 >>= sh2;
	seed7 >>= sh1;
	seed8 >>= sh2;

	seed9 >>= sh3;
	seed10 >>= sh3;
	seed11 >>= sh3;
	seed12 >>= sh3;

	// One planar gradient per candidate partition
	int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
	int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
	int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
	int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);

	// Apply the saw (keep the low 6 bits of each gradient)
	a &= 0x3F;
	b &= 0x3F;
	c &= 0x3F;
	d &= 0x3F;

	// Remove some of the components if we are to output < 4 partitions.
	if (partition_count <= 3)
	{
		d = 0;
	}

	if (partition_count <= 2)
	{
		c = 0;
	}

	if (partition_count <= 1)
	{
		b = 0;
	}

	// Pick the partition with the largest gradient value
	uint8_t partition;
	if (a >= b && a >= c && a >= d)
	{
		partition = 0;
	}
	else if (b >= c && b >= d)
	{
		partition = 1;
	}
	else if (c >= d)
	{
		partition = 2;
	}
	else
	{
		partition = 3;
	}

	return partition;
}
+
+/**
+ * @brief Generate a single partition info structure.
+ *
+ * @param[out] bsd                     The block size information.
+ * @param      partition_count         The partition count of this partitioning.
+ * @param      partition_index         The partition index / seed of this partitioning.
+ * @param      partition_remap_index   The remapped partition index of this partitioning.
+ * @param[out] pi                      The partition info structure to populate.
+ *
+ * @return True if this is a useful partition index, False if we can skip it.
+ */
+static bool generate_one_partition_info_entry(
+	block_size_descriptor& bsd,
+	unsigned int partition_count,
+	unsigned int partition_index,
+	unsigned int partition_remap_index,
+	partition_info& pi
+) {
+	int texels_per_block = bsd.texel_count;
+	bool small_block = texels_per_block < 32;
+
+	uint8_t *partition_of_texel = pi.partition_of_texel;
+
+	// Assign texels to partitions
+	int texel_idx = 0;
+	int counts[BLOCK_MAX_PARTITIONS] { 0 };
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		for (unsigned int y = 0; y <  bsd.ydim; y++)
+		{
+			for (unsigned int x = 0; x <  bsd.xdim; x++)
+			{
+				uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block);
+				pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++);
+				*partition_of_texel++ = part;
+			}
+		}
+	}
+
+	// Fill loop tail so we can overfetch later; pad each partition's texel
+	// list up to the SIMD width by replicating its last texel index
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		int ptex_count = counts[i];
+		int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
+		for (int j = ptex_count; j < ptex_count_simd; j++)
+		{
+			pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
+		}
+	}
+
+	// Populate the actual procedural partition count; a partitioning that
+	// leaves a lower-index partition empty degenerates to fewer partitions
+	if (counts[0] == 0)
+	{
+		pi.partition_count = 0;
+	}
+	else if (counts[1] == 0)
+	{
+		pi.partition_count = 1;
+	}
+	else if (counts[2] == 0)
+	{
+		pi.partition_count = 2;
+	}
+	else if (counts[3] == 0)
+	{
+		pi.partition_count = 3;
+	}
+	else
+	{
+		pi.partition_count = 4;
+	}
+
+	// Populate the partition index
+	pi.partition_index = static_cast<uint16_t>(partition_index);
+
+	// Populate the coverage bitmaps for 2/3/4 partitions
+	uint64_t* bitmaps { nullptr };
+	if (partition_count == 2)
+	{
+		bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
+	}
+	else if (partition_count == 3)
+	{
+		bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
+	}
+	else if (partition_count == 4)
+	{
+		bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
+	}
+
+	for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
+	{
+		pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]);
+	}
+
+	// Valid partitionings have texels in all of the requested partitions
+	bool valid = pi.partition_count == partition_count;
+
+	if (bitmaps)
+	{
+		// Populate the partition coverage bitmap
+		for (unsigned int i = 0; i < partition_count; i++)
+		{
+			bitmaps[i] = 0ULL;
+		}
+
+		// Bitmaps only cover the subset of texels used for k-means clustering
+		unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+		for (unsigned int i = 0; i < texels_to_process; i++)
+		{
+			unsigned int idx = bsd.kmeans_texels[i];
+			bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i;
+		}
+	}
+
+	return valid;
+}
+
+/**
+ * @brief Build the packed partition table for a single partition count.
+ *
+ * Candidate partitionings are processed in up to two passes. Pass 0 stores
+ * the "useful" partitionings (texels in every requested partition) that are
+ * not canonical duplicates of an already stored pattern. Pass 1 appends the
+ * remaining partitionings, and is skipped when @c can_omit_partitionings is
+ * set, so only the selected subset is kept.
+ *
+ * @param[out] bsd                      The block size information to populate.
+ * @param      can_omit_partitionings   True if non-selected partitionings can be omitted.
+ * @param      partition_count_cutoff   The largest partition count to build tables for
+ *                                      when omitting partitionings.
+ * @param      partition_count          The partition count of the table to build.
+ * @param[out] ptab                     The partition info array to populate.
+ * @param      canonical_patterns       Scratch space for the canonical pattern bitmaps.
+ */
+static void build_partition_table_for_one_partition_count(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff,
+	unsigned int partition_count,
+	partition_info* ptab,
+	uint64_t* canonical_patterns
+) {
+	unsigned int next_index = 0;
+	bsd.partitioning_count_selected[partition_count - 1] = 0;
+	bsd.partitioning_count_all[partition_count - 1] = 0;
+
+	// Skip tables larger than config max partition count if we can omit modes
+	if (can_omit_partitionings && (partition_count > partition_count_cutoff))
+	{
+		return;
+	}
+
+	// Iterate through twice
+	//   - Pass 0: Keep selected partitionings
+	//   - Pass 1: Keep non-selected partitionings (skip if in omit mode)
+	unsigned int max_iter = can_omit_partitionings ? 1 : 2;
+
+	// Tracker for things we built in the first iteration
+	uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
+	for (unsigned int x = 0; x < max_iter; x++)
+	{
+		for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
+		{
+			// Don't include things we built in the first pass
+			if ((x == 1) && build[i])
+			{
+				continue;
+			}
+
+			bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
+			if ((x == 0) && !keep_useful)
+			{
+				continue;
+			}
+
+			// Deduplicate against every partitioning stored so far
+			generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
+			bool keep_canonical = true;
+			for (unsigned int j = 0; j < next_index; j++)
+			{
+				bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns +  j * BIT_PATTERN_WORDS);
+				if (match)
+				{
+					keep_canonical = false;
+					break;
+				}
+			}
+
+			if (keep_useful && keep_canonical)
+			{
+				// Pass 0 stores useful, canonically unique partitionings
+				if (x == 0)
+				{
+					bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+					bsd.partitioning_count_selected[partition_count - 1]++;
+					bsd.partitioning_count_all[partition_count - 1]++;
+					build[i] = 1;
+					next_index++;
+				}
+			}
+			else
+			{
+				// Pass 1 stores everything that pass 0 rejected
+				if (x == 1)
+				{
+					bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+					bsd.partitioning_count_all[partition_count - 1]++;
+					next_index++;
+				}
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+void init_partition_tables(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff
+) {
+	// Tables are stored in the order 2/3/4 partitions, with the single
+	// 1-partition entry stored after the 4-partition table
+	partition_info* par_tab2 = bsd.partitionings;
+	partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS;
+	partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS;
+	partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS;
+
+	// There is only one possible 1-partition partitioning, so build it directly
+	generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1);
+	bsd.partitioning_count_selected[0] = 1;
+	bsd.partitioning_count_all[0] = 1;
+
+	// Temporary scratch used for duplicate detection while building the tables
+	uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
+
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
+	build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
+
+	delete[] canonical_patterns;
+}

+ 1251 - 0
thirdparty/astcenc/astcenc_percentile_tables.cpp

@@ -0,0 +1,1251 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Percentile data tables for different block encodings.
+ *
+ * To reduce binary size the tables are stored using a packed differential encoding.
+ */
+
+#include "astcenc_internal.h"
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+/**
+ * @brief Structure containing packed percentile metadata.
+ *
+ * Note that percentile tables do not exist for 3D textures, so no zdim is stored.
+ */
+struct packed_percentile_table
+{
+	/** The block X dimension. */
+	uint8_t xdim;
+
+	/** The block Y dimension. */
+	uint8_t ydim;
+
+	/** The number of packed items in the 1 and 2 plane data. */
+	uint16_t item_count[2];
+
+	/** The accumulator divisor for 1 and 2 plane data. */
+	uint16_t difscales[2];
+
+	/** The initial accumulator values for 1 and 2 plane data. */
+	uint16_t initial_percs[2];
+
+	/** The packed data for the 1 and 2 plane data; non-owning pointers to
+	 * static arrays, stored using the differential encoding described in the
+	 * file header. */
+	const uint16_t *items[2];
+};
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4)
+static const uint16_t percentile_arr_4x4_0[61] {
+	0x0242, 0x7243, 0x6A51, 0x6A52, 0x5A41, 0x4A53, 0x8851, 0x3842,
+	0x3852, 0x3853, 0x3043, 0xFA33, 0x1BDF, 0x2022, 0x1032, 0x29CE,
+	0x21DE, 0x2823, 0x0813, 0x0A13, 0x0A31, 0x0A23, 0x09CF, 0x0833,
+	0x0A32, 0x01DF, 0x0BDD, 0x0BCF, 0x0221, 0x095F, 0x0A01, 0x0BDE,
+	0x0BCD, 0x0A22, 0x09AF, 0x0B5F, 0x0B4D, 0x0BCE, 0x0BBF, 0x0A11,
+	0x01BF, 0x0202, 0x0B5D, 0x1203, 0x034E, 0x0B8E, 0x035E, 0x0212,
+	0x032E, 0x0B4F, 0x03AF, 0x03AD, 0x03BD, 0x0BBE, 0x03AE, 0x039F,
+	0x039E, 0x033E, 0x033F, 0x038F, 0x032F
+};
+
+static const uint16_t percentile_arr_4x4_1[84] {
+	0x0452, 0xFFAE, 0x2433, 0x1DDF, 0x17CD, 0x1E21, 0x1C43, 0x1442,
+	0x3FBE, 0x1FDD, 0x0E31, 0x0F4F, 0x1423, 0x0FBD, 0x1451, 0x0E03,
+	0x05CF, 0x0C32, 0x0DDE, 0x27AD, 0x274E, 0x0E02, 0x0F5E, 0x07AF,
+	0x0F5F, 0x0DCE, 0x0C41, 0x0422, 0x0613, 0x0E12, 0x0611, 0x0F3F,
+	0x0601, 0x0DBF, 0x05DD, 0x075D, 0x0C02, 0x054E, 0x0431, 0x0413,
+	0x079F, 0x05BE, 0x0F4D, 0x0403, 0x05AF, 0x055F, 0x05AE, 0x054F,
+	0x0421, 0x05BD, 0x0DCD, 0x0411, 0x0412, 0x055E, 0x055D, 0x073D,
+	0x058E, 0x072F, 0x072D, 0x079D, 0x0D2E, 0x0453, 0x078D, 0x053E,
+	0x053F, 0x059E, 0x052F, 0x058F, 0x072E, 0x078F, 0x059F, 0x078E,
+	0x071F, 0x073E, 0x051F, 0x070D, 0x079E, 0x070E, 0x071D, 0x0622,
+	0x070F, 0x071E, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_4x4 {
+	4, 4,
+	{ 61, 84 },
+	{ 184, 141 },
+	{ 0, 53 },
+	{ percentile_arr_4x4_0, percentile_arr_4x4_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4)
+static const uint16_t percentile_arr_5x4_0[91] {
+	0x02C1, 0xFAD1, 0xE8D3, 0xDAC2, 0xA8D2, 0x70D1, 0x50C2, 0x80C3,
+	0xD2C3, 0x4AA2, 0x2AD2, 0x2242, 0x2251, 0x42A3, 0x1A43, 0x4A52,
+	0x32B3, 0x2A41, 0x1042, 0x1851, 0x5892, 0x10A2, 0x2253, 0x10B2,
+	0x10B3, 0x13DF, 0x3083, 0x08B1, 0x1043, 0x12B1, 0x0AB2, 0x1A93,
+	0x1852, 0x1A33, 0x09CE, 0x08A3, 0x1022, 0x1283, 0x0853, 0x1AA1,
+	0x1093, 0x11DE, 0x135F, 0x1832, 0x195F, 0x0A81, 0x11CF, 0x0A31,
+	0x09DF, 0x0B4D, 0x09AF, 0x03CF, 0x0813, 0x03DD, 0x0A92, 0x0A82,
+	0x03CD, 0x0023, 0x0BDE, 0x0BBF, 0x1232, 0x0221, 0x0291, 0x0A23,
+	0x0833, 0x035D, 0x0BCE, 0x01BF, 0x0222, 0x134E, 0x0213, 0x0A01,
+	0x0B4F, 0x0B5E, 0x038E, 0x032E, 0x03AF, 0x0A11, 0x03AD, 0x0203,
+	0x0202, 0x0BBD, 0x033E, 0x03AE, 0x03BE, 0x0212, 0x033F, 0x039E,
+	0x039F, 0x032F, 0x038F
+};
+
+static const uint16_t percentile_arr_5x4_1[104] {
+	0x0433, 0xB621, 0x5452, 0x4443, 0x7FAE, 0xFCA3, 0x7CC2, 0x24B2,
+	0x45DF, 0x44B3, 0x7631, 0x27CD, 0x1CD1, 0x1E03, 0x4FBE, 0x774F,
+	0x1C42, 0x7691, 0x24A2, 0x2681, 0x3C23, 0x3C93, 0x0FBD, 0x1C32,
+	0x1E82, 0x1E12, 0x0F4E, 0x1602, 0x0FAD, 0x0C51, 0x1FDD, 0x0E13,
+	0x0DCF, 0x175E, 0x0C22, 0x175F, 0x15DE, 0x0CB1, 0x17AF, 0x1CC1,
+	0x1F3F, 0x1483, 0x0441, 0x0C91, 0x04D2, 0x0DCE, 0x154E, 0x079F,
+	0x0CA1, 0x0F5D, 0x0431, 0x15DD, 0x05BF, 0x0C92, 0x0611, 0x0C82,
+	0x0402, 0x074D, 0x0DBD, 0x055E, 0x05BE, 0x0DCD, 0x0421, 0x05AF,
+	0x0403, 0x0D4F, 0x055F, 0x05AE, 0x0413, 0x0E01, 0x055D, 0x073D,
+	0x0C12, 0x0692, 0x0411, 0x072D, 0x078D, 0x079D, 0x058E, 0x0D2E,
+	0x0453, 0x072F, 0x059E, 0x052F, 0x071F, 0x053F, 0x053E, 0x078F,
+	0x058F, 0x051F, 0x0F2E, 0x059F, 0x078E, 0x073E, 0x071D, 0x070D,
+	0x070E, 0x079E, 0x0622, 0x0683, 0x070F, 0x071E, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_5x4 {
+	5, 4,
+	{ 91, 104 },
+	{ 322, 464 },
+	{ 0, 202 },
+	{ percentile_arr_5x4_0, percentile_arr_5x4_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5)
+static const uint16_t percentile_arr_5x5_0[129] {
+	0x00F3, 0xF8F2, 0x70E3, 0x62E1, 0x60E1, 0x4AC1, 0x3261, 0x38D3,
+	0x3271, 0x5AF1, 0x5873, 0x2AD1, 0x28E2, 0x28F1, 0x2262, 0x9AC2,
+	0x18D2, 0x1072, 0x1071, 0x22A2, 0x2062, 0x1A51, 0x10C2, 0x0892,
+	0x08D1, 0x1AA3, 0x23EE, 0x08C3, 0x0BEF, 0x2242, 0x0863, 0x0AB3,
+	0x0BFF, 0x0A93, 0x08A2, 0x0A41, 0x1083, 0x0842, 0x10B3, 0x21EE,
+	0x10B2, 0x00B1, 0x1263, 0x12C3, 0x0A83, 0x0851, 0x11FE, 0x0253,
+	0x09FD, 0x0A72, 0x09FF, 0x1AB2, 0x0BDF, 0x0A33, 0x0243, 0x0B7F,
+	0x0AB1, 0x12D2, 0x0252, 0x096F, 0x00A3, 0x0893, 0x0822, 0x0843,
+	0x097E, 0x097F, 0x01EF, 0x09CE, 0x03FE, 0x0A81, 0x036F, 0x0052,
+	0x13FD, 0x0AA1, 0x1853, 0x036D, 0x0A92, 0x0832, 0x01DE, 0x0A82,
+	0x0BED, 0x0231, 0x0BBF, 0x03DD, 0x0B6E, 0x01AF, 0x0813, 0x0023,
+	0x0A91, 0x015F, 0x037E, 0x01CF, 0x0232, 0x0BCD, 0x0221, 0x0BDE,
+	0x0213, 0x035F, 0x0B7D, 0x0223, 0x01BF, 0x0BCF, 0x01DF, 0x0033,
+	0x0222, 0x03CE, 0x0A01, 0x03AF, 0x034D, 0x0B8E, 0x032E, 0x0203,
+	0x0211, 0x0202, 0x0B5D, 0x03AD, 0x034E, 0x03AE, 0x034F, 0x033F,
+	0x039F, 0x03BD, 0x03BE, 0x035E, 0x0212, 0x033E, 0x039E, 0x032F,
+	0x038F
+};
+
+static const uint16_t percentile_arr_5x5_1[126] {
+	0x0443, 0x6452, 0xFE21, 0x27AE, 0x2433, 0x1FCD, 0x25DF, 0x6CC2,
+	0x2C62, 0x1F4F, 0x4C42, 0x1FBE, 0x0DEF, 0x34A3, 0x0E03, 0x54B2,
+	0x1F7D, 0x17DD, 0x0DFF, 0x0CD1, 0x0E31, 0x0C71, 0x1CF1, 0x15FE,
+	0x1691, 0x1681, 0x24B3, 0x174E, 0x0F6E, 0x0493, 0x175E, 0x1C51,
+	0x17BD, 0x076D, 0x2CA2, 0x05EE, 0x1472, 0x2423, 0x0DCF, 0x0432,
+	0x15DE, 0x0612, 0x0CD2, 0x0682, 0x0F5F, 0x07AD, 0x0602, 0x0CE1,
+	0x0C91, 0x0FAF, 0x073F, 0x0E13, 0x0D7F, 0x0DCE, 0x0422, 0x0D7D,
+	0x0441, 0x05FD, 0x0CB1, 0x0C83, 0x04C1, 0x0461, 0x0F9F, 0x0DDD,
+	0x056E, 0x0C92, 0x0482, 0x0431, 0x05ED, 0x0D6F, 0x075D, 0x0402,
+	0x057E, 0x0DBF, 0x04A1, 0x054E, 0x0F4D, 0x0403, 0x05CD, 0x0453,
+	0x05AE, 0x0421, 0x0F1F, 0x05BE, 0x0601, 0x0611, 0x05BD, 0x05AF,
+	0x078D, 0x072D, 0x073D, 0x055E, 0x0F9D, 0x0411, 0x0413, 0x0412,
+	0x055F, 0x077E, 0x055D, 0x052E, 0x054F, 0x053E, 0x058E, 0x078F,
+	0x059E, 0x071D, 0x0E92, 0x053F, 0x059F, 0x051F, 0x072F, 0x052F,
+	0x070D, 0x079E, 0x058F, 0x072E, 0x070E, 0x078E, 0x070F, 0x073E,
+	0x0622, 0x0683, 0x071E, 0x076F, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_5x5 {
+	5, 5,
+	{ 129, 126 },
+	{ 258, 291 },
+	{ 0, 116 },
+	{ percentile_arr_5x5_0, percentile_arr_5x5_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5)
+static const uint16_t percentile_arr_6x5_0[165] {
+	0x0163, 0xF8F3, 0x9962, 0x8972, 0x7961, 0x7173, 0x6953, 0x5943,
+	0x4B41, 0x3AE1, 0x38E3, 0x6971, 0x32C1, 0x28D3, 0x2A61, 0xC8F2,
+	0x2271, 0x4873, 0x5B21, 0x3AD1, 0x1B13, 0x1952, 0x1B51, 0x12F1,
+	0x1A62, 0x1322, 0x1951, 0x10E2, 0x1B31, 0x20F1, 0x2102, 0x2072,
+	0x10D2, 0x1142, 0x2912, 0x3871, 0x2BEE, 0x0862, 0x1123, 0x0AC2,
+	0x12A2, 0x0A51, 0x1922, 0x0941, 0x1BEF, 0x0B42, 0x08D1, 0x13FF,
+	0x1933, 0x08C3, 0x08C2, 0x1131, 0x08E1, 0x2903, 0x0863, 0x0B32,
+	0x1132, 0x1AC3, 0x0A42, 0x1A41, 0x0042, 0x21EE, 0x09FF, 0x03DF,
+	0x0AA3, 0x11FE, 0x02B3, 0x0B11, 0x10B3, 0x0B03, 0x11FD, 0x0913,
+	0x0A53, 0x037F, 0x1263, 0x0051, 0x0A33, 0x0B01, 0x016F, 0x0A72,
+	0x1312, 0x08A2, 0x10B1, 0x0BFE, 0x11EF, 0x0B02, 0x0A52, 0x0043,
+	0x0822, 0x01CE, 0x0A43, 0x097F, 0x036F, 0x08B2, 0x03FD, 0x0A83,
+	0x0B33, 0x0AB1, 0x017E, 0x0B23, 0x0852, 0x02D2, 0x0BBF, 0x0BDD,
+	0x03ED, 0x0AB2, 0x02A1, 0x0853, 0x036D, 0x0892, 0x0032, 0x0A31,
+	0x0083, 0x09DE, 0x0A93, 0x08A3, 0x1213, 0x0BDE, 0x03CD, 0x036E,
+	0x037E, 0x0A21, 0x0023, 0x0BCF, 0x01CF, 0x0013, 0x01AF, 0x0A92,
+	0x0232, 0x035F, 0x0093, 0x0B7D, 0x015F, 0x0282, 0x01BF, 0x09DF,
+	0x03CE, 0x0223, 0x0833, 0x0222, 0x03AF, 0x0A01, 0x0291, 0x0B4D,
+	0x032E, 0x038E, 0x0203, 0x0281, 0x035D, 0x03AD, 0x0B9F, 0x0202,
+	0x034F, 0x03BE, 0x0211, 0x03AE, 0x03BD, 0x0212, 0x034E, 0x033F,
+	0x033E, 0x035E, 0x039E, 0x032F, 0x038F
+};
+
+static const uint16_t percentile_arr_6x5_1[145] {
+	0x0443, 0xEFAE, 0x2CC2, 0x2E21, 0x2C52, 0x7C33, 0x47CD, 0x25DF,
+	0x3CA3, 0xFFBE, 0x2551, 0x24B3, 0x474F, 0x1513, 0x2691, 0x1603,
+	0x1462, 0x1D32, 0x14B2, 0x5442, 0x2CD2, 0x35EF, 0x0CD1, 0x3D22,
+	0x17BD, 0x0FDD, 0x0DFF, 0x2631, 0x177D, 0x0CF1, 0x1E81, 0x0E82,
+	0x1DFE, 0x0F5E, 0x0701, 0x2CA2, 0x1D03, 0x0F4E, 0x1471, 0x0C51,
+	0x1F6E, 0x2FAF, 0x0561, 0x0C72, 0x176D, 0x0FAD, 0x0DEE, 0x05CF,
+	0x0E13, 0x0F5F, 0x0E12, 0x0C23, 0x1E02, 0x1D12, 0x0CB1, 0x0C32,
+	0x0C93, 0x15DE, 0x0F9F, 0x0F3F, 0x0D41, 0x0C41, 0x0CC1, 0x0D31,
+	0x0C22, 0x05FD, 0x057F, 0x0D01, 0x0461, 0x04E1, 0x0D7D, 0x05CE,
+	0x0502, 0x0C31, 0x05ED, 0x05DD, 0x0511, 0x0F11, 0x0491, 0x0D6F,
+	0x0521, 0x056E, 0x0C83, 0x0D23, 0x04A1, 0x0C02, 0x075D, 0x05BF,
+	0x0C21, 0x079D, 0x0482, 0x05BD, 0x0DBE, 0x05CD, 0x054E, 0x057E,
+	0x0DAE, 0x074D, 0x078D, 0x0542, 0x0492, 0x05AF, 0x0611, 0x0F3D,
+	0x0601, 0x071F, 0x055E, 0x059E, 0x0571, 0x054F, 0x0412, 0x0453,
+	0x058E, 0x0413, 0x0D3E, 0x077E, 0x072D, 0x052E, 0x059F, 0x055D,
+	0x072F, 0x0403, 0x0411, 0x058F, 0x055F, 0x0692, 0x078E, 0x053F,
+	0x0D2F, 0x078F, 0x070D, 0x071D, 0x051F, 0x072E, 0x079E, 0x070E,
+	0x070F, 0x073E, 0x0622, 0x0683, 0x0702, 0x071E, 0x076F, 0x07BF,
+	0x07CE
+};
+
+static const packed_percentile_table block_pcd_6x5 {
+	6, 5,
+	{ 165, 145 },
+	{ 388, 405 },
+	{ 0, 156 },
+	{ percentile_arr_6x5_0, percentile_arr_6x5_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6)
+static const uint16_t percentile_arr_6x6_0[206] {
+	0x006F, 0xF908, 0xF104, 0xE918, 0xE963, 0xD114, 0xB0F3, 0xA07E,
+	0x7972, 0x705F, 0x687F, 0x6162, 0x5953, 0x586E, 0x610C, 0x524D,
+	0x5973, 0x9943, 0x98E3, 0x904F, 0x8341, 0x7AC1, 0x3A61, 0x70D3,
+	0xA073, 0x6AE1, 0x30F2, 0x3313, 0x2B21, 0x9A2E, 0x4322, 0x225D,
+	0x2331, 0x2271, 0x22D1, 0x1A2D, 0x221F, 0x22F1, 0x1971, 0x6952,
+	0x1951, 0x187D, 0x18F1, 0x1902, 0x185E, 0x1B51, 0x105D, 0x1A3D,
+	0x30E2, 0x10D2, 0x1961, 0x12A2, 0x6072, 0x3942, 0x386D, 0x33EE,
+	0x104E, 0x4923, 0x101E, 0x2122, 0x1251, 0x1141, 0x182F, 0x3133,
+	0x080E, 0x1262, 0x123E, 0x1B32, 0x102E, 0x1931, 0x10D1, 0x1912,
+	0x0871, 0x12C2, 0x08C2, 0x1103, 0x0B03, 0x1062, 0x083D, 0x08E1,
+	0x1132, 0x184D, 0x0863, 0x08C3, 0x303F, 0x083E, 0x10B3, 0x12A3,
+	0x0BEF, 0x0B11, 0x1A42, 0x2233, 0x13FF, 0x080F, 0x0A41, 0x0AC3,
+	0x0842, 0x1A63, 0x0BDF, 0x09FF, 0x12B3, 0x124E, 0x0B12, 0x0B42,
+	0x0A2F, 0x1253, 0x0913, 0x1051, 0x0B01, 0x120F, 0x0B02, 0x08A2,
+	0x0BBF, 0x00B1, 0x22B1, 0x01EE, 0x1B33, 0x0B23, 0x0283, 0x13FD,
+	0x0AB2, 0x11FD, 0x09FE, 0x0A43, 0x08B2, 0x0A1D, 0x0A52, 0x023F,
+	0x101F, 0x01CE, 0x0A31, 0x0BDD, 0x0293, 0x1822, 0x12A1, 0x03FE,
+	0x121E, 0x0843, 0x0272, 0x0B6F, 0x0052, 0x0A0D, 0x0BED, 0x12D2,
+	0x1B7F, 0x1053, 0x0032, 0x01DE, 0x08A3, 0x020E, 0x0883, 0x09EF,
+	0x0892, 0x0A21, 0x03CD, 0x0B5F, 0x0213, 0x0A32, 0x016F, 0x1292,
+	0x03DE, 0x017E, 0x0BAF, 0x0223, 0x1093, 0x0BCF, 0x037E, 0x01DF,
+	0x09CF, 0x015F, 0x09AF, 0x0023, 0x01BF, 0x0222, 0x0282, 0x03CE,
+	0x1013, 0x036E, 0x097F, 0x0033, 0x0A01, 0x0B6D, 0x03BE, 0x037D,
+	0x0281, 0x0BAE, 0x0203, 0x032E, 0x034D, 0x034F, 0x0291, 0x0211,
+	0x038E, 0x03BD, 0x039E, 0x0BAD, 0x033E, 0x034E, 0x039F, 0x0202,
+	0x035D, 0x0212, 0x033F, 0x035E, 0x038F, 0x032F
+};
+
+static const uint16_t percentile_arr_6x6_1[164] {
+	0x07AE, 0x8443, 0x7E21, 0x77CD, 0x6C62, 0x9433, 0x6452, 0x34C2,
+	0x5DDF, 0xC7BE, 0x25EF, 0x24A3, 0x3CF1, 0xFDFF, 0x177D, 0x1F4F,
+	0xC551, 0x5CB3, 0x1532, 0x1513, 0x143E, 0x245D, 0x14B2, 0x2472,
+	0x14D2, 0x1FBD, 0x1631, 0x2DFE, 0x1691, 0x17DD, 0x2E03, 0x376E,
+	0x2442, 0x0F6D, 0x3C71, 0x2CD1, 0x2522, 0x6C51, 0x260D, 0x17AF,
+	0x0DEE, 0x1C1F, 0x2F01, 0x142E, 0x0CA2, 0x0FAD, 0x3D03, 0x275E,
+	0x1681, 0x274E, 0x1682, 0x1C23, 0x273F, 0x0F5F, 0x05DE, 0x15FD,
+	0x0DCF, 0x1E02, 0x04B1, 0x144D, 0x0E12, 0x0D12, 0x1CC1, 0x0E13,
+	0x1C6D, 0x0C32, 0x043D, 0x0C61, 0x0F9F, 0x04E1, 0x0DCE, 0x0D41,
+	0x1C93, 0x0C22, 0x061D, 0x0D7F, 0x0C41, 0x0561, 0x0531, 0x0D21,
+	0x0711, 0x0C91, 0x0501, 0x0C1E, 0x040F, 0x15DD, 0x0431, 0x0C2F,
+	0x057D, 0x0C2D, 0x0DBE, 0x040E, 0x0D02, 0x0D11, 0x054E, 0x040D,
+	0x0D23, 0x0DBF, 0x04A1, 0x05ED, 0x0C1D, 0x05BD, 0x072D, 0x056E,
+	0x0483, 0x0F3D, 0x0482, 0x078D, 0x0F5D, 0x0453, 0x0D9E, 0x0C4E,
+	0x05CD, 0x079D, 0x0402, 0x05AE, 0x0F1F, 0x0542, 0x074D, 0x056F,
+	0x0421, 0x0D4F, 0x0601, 0x0571, 0x0492, 0x059F, 0x053F, 0x05AF,
+	0x0611, 0x055E, 0x0D8E, 0x053E, 0x055D, 0x047D, 0x0411, 0x052E,
+	0x058F, 0x051F, 0x055F, 0x0D7E, 0x072F, 0x052F, 0x0412, 0x078F,
+	0x0403, 0x077E, 0x070D, 0x070E, 0x078E, 0x0F1D, 0x072E, 0x0413,
+	0x070F, 0x0692, 0x079E, 0x060E, 0x0622, 0x0683, 0x0702, 0x071E,
+	0x073E, 0x076F, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_6x6 {
+	6, 6,
+	{ 206, 164 },
+	{ 769, 644 },
+	{ 0, 256 },
+	{ percentile_arr_6x6_0, percentile_arr_6x6_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5)
+static const uint16_t percentile_arr_8x5_0[226] {
+	0x0066, 0xF865, 0xE963, 0xA856, 0xA1F2, 0x9875, 0x91C3, 0x91E2,
+	0x80F3, 0x8076, 0x61E3, 0x6153, 0x5172, 0x59D2, 0x51D3, 0x5047,
+	0xA943, 0x49B3, 0x4846, 0x4962, 0xC037, 0x4173, 0x39F1, 0x7027,
+	0xA2C1, 0x3AE1, 0x9341, 0x30D3, 0x5225, 0x2A61, 0x33C1, 0x28E3,
+	0x53A1, 0x49C2, 0x2A06, 0x4055, 0x2006, 0x21D1, 0x2271, 0x4321,
+	0x3873, 0x18F2, 0x2015, 0x1A15, 0x1857, 0x52D1, 0x3045, 0x4835,
+	0x1952, 0x29E1, 0x3207, 0x1036, 0x1816, 0x2A16, 0x2971, 0x13B1,
+	0x2A17, 0x2351, 0x1025, 0x1826, 0x30E2, 0x1262, 0x20F1, 0x1007,
+	0x1072, 0x1151, 0x10D2, 0x1235, 0x1205, 0x1062, 0x4AF1, 0x1251,
+	0x0B31, 0x1381, 0x13EE, 0x1B92, 0x13EF, 0x0942, 0x1AA2, 0x13FF,
+	0x1161, 0x0B93, 0x19A2, 0x11B1, 0x08D1, 0x12C2, 0x0B13, 0x1B22,
+	0x2123, 0x09A3, 0x2071, 0x1B7F, 0x1817, 0x0A42, 0x10C2, 0x1233,
+	0x08C3, 0x0A41, 0x0B42, 0x09C1, 0x0933, 0x1AB3, 0x1382, 0x1BDF,
+	0x2122, 0x0A53, 0x0AC3, 0x20E1, 0x0941, 0x0931, 0x0042, 0x0BA2,
+	0x0AA3, 0x0992, 0x0863, 0x08B3, 0x11B2, 0x0902, 0x1283, 0x09FF,
+	0x0B83, 0x0982, 0x0932, 0x0BFE, 0x0B32, 0x0BBF, 0x11FE, 0x036F,
+	0x0851, 0x08B1, 0x18A2, 0x11EE, 0x0A52, 0x0BB2, 0x01FD, 0x0A43,
+	0x1A63, 0x1193, 0x0B91, 0x0043, 0x1231, 0x0A26, 0x0AB1, 0x03FD,
+	0x096F, 0x00B2, 0x0983, 0x0A72, 0x01CE, 0x0BDD, 0x0022, 0x0B11,
+	0x1213, 0x0B6D, 0x017E, 0x1333, 0x0112, 0x0852, 0x02D2, 0x097F,
+	0x01EF, 0x0AB2, 0x0293, 0x0853, 0x0BED, 0x0B12, 0x1303, 0x02A1,
+	0x0892, 0x0032, 0x0883, 0x0B6E, 0x0292, 0x0A32, 0x037E, 0x0B23,
+	0x0103, 0x0A21, 0x0B01, 0x0302, 0x0BCD, 0x00A3, 0x0BCF, 0x0BDE,
+	0x0113, 0x01DE, 0x0B5F, 0x0013, 0x0BAF, 0x0223, 0x0222, 0x0A82,
+	0x0833, 0x0023, 0x09CF, 0x037D, 0x01AF, 0x095F, 0x03CE, 0x09DF,
+	0x01BF, 0x0893, 0x0203, 0x0201, 0x0B4D, 0x03BE, 0x032E, 0x03AE,
+	0x0291, 0x0A02, 0x0211, 0x039F, 0x0281, 0x038E, 0x03AD, 0x033F,
+	0x035D, 0x033E, 0x034E, 0x034F, 0x0212, 0x03BD, 0x032F, 0x035E,
+	0x038F, 0x039E
+};
+
+static const uint16_t percentile_arr_8x5_1[167] {
+	0x0621, 0xFCC2, 0x3443, 0xA433, 0x5532, 0x2551, 0x6CA3, 0x27AE,
+	0x6452, 0x8E03, 0x3CB3, 0x4DA2, 0x6DDF, 0x37CD, 0x6F01, 0x1691,
+	0x2E82, 0x27BE, 0x1513, 0x34D2, 0x1D22, 0x3E31, 0x2593, 0x2CB2,
+	0x1C16, 0x374F, 0x0DD1, 0x2583, 0x6613, 0x0CD1, 0x0C35, 0x1462,
+	0x3E81, 0x2612, 0x2C42, 0x3407, 0x14A2, 0x0E02, 0x1CF1, 0x0C06,
+	0x17BD, 0x0F7D, 0x1D23, 0x35B1, 0x179F, 0x0D92, 0x0F5E, 0x1451,
+	0x04B1, 0x1F6E, 0x0DEF, 0x0D31, 0x374E, 0x15C1, 0x0541, 0x2405,
+	0x17AD, 0x0471, 0x1472, 0x0DFE, 0x0711, 0x0FDD, 0x0DFF, 0x0432,
+	0x1D82, 0x0423, 0x0F6D, 0x07AF, 0x0F5F, 0x04C1, 0x1542, 0x0561,
+	0x0DCF, 0x1D03, 0x1493, 0x0422, 0x0445, 0x0D12, 0x0C25, 0x0415,
+	0x0DA1, 0x1591, 0x0DEE, 0x05DE, 0x0C31, 0x0491, 0x0441, 0x0D21,
+	0x078D, 0x057D, 0x0C61, 0x0F3F, 0x0581, 0x0D6E, 0x0501, 0x0CA1,
+	0x04E1, 0x0DFD, 0x057F, 0x0502, 0x0511, 0x0C82, 0x0483, 0x0C03,
+	0x079D, 0x0402, 0x0DDD, 0x0611, 0x05AE, 0x0DCE, 0x056F, 0x0421,
+	0x057E, 0x071F, 0x0DBF, 0x05BE, 0x0412, 0x059F, 0x054E, 0x077E,
+	0x0C26, 0x05ED, 0x073D, 0x0601, 0x0492, 0x0453, 0x075D, 0x058E,
+	0x0F2D, 0x05CD, 0x0571, 0x053E, 0x0692, 0x05BD, 0x054F, 0x055E,
+	0x0411, 0x0F1D, 0x074D, 0x059E, 0x05AF, 0x070D, 0x053F, 0x058F,
+	0x0413, 0x070F, 0x055D, 0x070E, 0x078F, 0x052E, 0x072F, 0x055F,
+	0x078E, 0x0F2E, 0x052F, 0x051F, 0x0417, 0x071E, 0x0781, 0x0622,
+	0x0683, 0x0702, 0x073E, 0x076F, 0x079E, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_8x5 {
+	8, 5,
+	{ 226, 167 },
+	{ 763, 517 },
+	{ 0, 178 },
+	{ percentile_arr_8x5_0, percentile_arr_8x5_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6)
+static const uint16_t percentile_arr_8x6_0[273] {
+	0x0154, 0xF944, 0xE066, 0xA128, 0x9963, 0x8118, 0x806F, 0x79F2,
+	0x79E2, 0x7108, 0xD934, 0x6056, 0x69C3, 0x60F3, 0x5972, 0x59E3,
+	0x5075, 0x91B3, 0xC9D2, 0x807E, 0x385F, 0x4153, 0x3943, 0x4162,
+	0x3837, 0x3847, 0x7173, 0x31D3, 0x6948, 0x3046, 0x307F, 0x5827,
+	0x3114, 0x32C1, 0x3076, 0x2A4D, 0x58E3, 0x306E, 0x2924, 0x2A61,
+	0x29F1, 0x50D3, 0x704F, 0x210C, 0x2BA1, 0x2225, 0x2873, 0x4865,
+	0x2206, 0x8341, 0x2006, 0x3B21, 0x18F2, 0x21C2, 0x1A1F, 0x23C1,
+	0x3AE1, 0x1855, 0x19D1, 0x1A15, 0x3815, 0x1207, 0x1835, 0x2A2E,
+	0x1A16, 0x1836, 0x2271, 0x2845, 0x1A2D, 0x11E1, 0x1816, 0x1171,
+	0x2217, 0x1952, 0x12D1, 0x3904, 0x125D, 0x4BB1, 0x207D, 0x10E2,
+	0x1026, 0x2025, 0x12F1, 0x28F1, 0x105D, 0x1235, 0x12A2, 0x1007,
+	0x123D, 0x1A05, 0x1072, 0x1331, 0x101E, 0x0951, 0x10D2, 0x1057,
+	0x1B92, 0x185E, 0x1251, 0x19A2, 0x186D, 0x0B81, 0x2BEE, 0x080E,
+	0x1A33, 0x1942, 0x0B13, 0x0B51, 0x11A3, 0x0923, 0x2322, 0x09B1,
+	0x184E, 0x1161, 0x18D1, 0x0933, 0x0B93, 0x4A62, 0x1017, 0x082F,
+	0x0A42, 0x0B82, 0x0AA3, 0x0A41, 0x08C2, 0x08B3, 0x0A3E, 0x22B3,
+	0x0871, 0x1BBF, 0x09C1, 0x0AC2, 0x09B2, 0x0BEF, 0x082E, 0x1062,
+	0x0922, 0x08C3, 0x1063, 0x0A53, 0x0BDF, 0x080F, 0x0B42, 0x0A83,
+	0x084D, 0x103F, 0x0931, 0x08E1, 0x0A0F, 0x1BA2, 0x09FF, 0x1332,
+	0x03FF, 0x0941, 0x12C3, 0x0A63, 0x003D, 0x0842, 0x083E, 0x0B83,
+	0x0BB2, 0x0A31, 0x0932, 0x1102, 0x0992, 0x0982, 0x1051, 0x08B1,
+	0x0A2F, 0x121E, 0x02B1, 0x0A4E, 0x11EE, 0x00A2, 0x1022, 0x0043,
+	0x0A52, 0x0A1D, 0x0226, 0x1193, 0x03DD, 0x08B2, 0x0BFD, 0x0A43,
+	0x0A13, 0x0AB2, 0x01FD, 0x09FE, 0x020D, 0x081F, 0x0B33, 0x0053,
+	0x0B91, 0x0293, 0x0B11, 0x0B7F, 0x0AA1, 0x0B03, 0x0A0E, 0x03FE,
+	0x01CE, 0x0B6F, 0x0183, 0x0912, 0x023F, 0x0852, 0x0A21, 0x0323,
+	0x03ED, 0x0A32, 0x13AF, 0x0272, 0x08A3, 0x0B12, 0x0083, 0x0832,
+	0x13CD, 0x0223, 0x0A92, 0x0092, 0x0AD2, 0x0301, 0x0302, 0x0BDE,
+	0x0A22, 0x01EF, 0x0B5F, 0x0103, 0x0BCF, 0x096F, 0x017E, 0x0113,
+	0x01DE, 0x0823, 0x0282, 0x0B6E, 0x015F, 0x0813, 0x01AF, 0x01CF,
+	0x0B7E, 0x0033, 0x01DF, 0x0BCE, 0x01BF, 0x036D, 0x0A03, 0x017F,
+	0x03BE, 0x0201, 0x0893, 0x038E, 0x034D, 0x03AE, 0x0202, 0x039F,
+	0x0291, 0x0A11, 0x032E, 0x033F, 0x034F, 0x0281, 0x037D, 0x03BD,
+	0x0212, 0x033E, 0x035E, 0x034E, 0x035D, 0x03AD, 0x032F, 0x038F,
+	0x039E
+};
+
+static const uint16_t percentile_arr_8x6_1[186] {
+	0x0621, 0xFC33, 0x37AE, 0x1CC2, 0x2C43, 0xAD32, 0x34A3, 0x4551,
+	0x6452, 0x5C62, 0x1FCD, 0x14F1, 0x4CB3, 0x24D2, 0x15DF, 0x0FBE,
+	0x2603, 0x3DA2, 0x2E31, 0x25D1, 0x25EF, 0x0D22, 0x2E91, 0x1E82,
+	0x0FBD, 0x1513, 0x0CB2, 0x0CD1, 0x0F4F, 0x1F7D, 0x1701, 0x0C16,
+	0x2593, 0x2C42, 0x0C72, 0x14A2, 0x0F6E, 0x0C35, 0x0C71, 0x0D83,
+	0x0C07, 0x1DFF, 0x043E, 0x1613, 0x07DD, 0x0FAD, 0x1451, 0x076D,
+	0x0E81, 0x05FE, 0x0406, 0x0E0D, 0x045D, 0x2612, 0x0E02, 0x07AF,
+	0x0DB1, 0x0F5E, 0x15C1, 0x0C23, 0x1523, 0x0C1F, 0x0D92, 0x04B1,
+	0x0D31, 0x0432, 0x0D61, 0x0F4E, 0x0D41, 0x0DEE, 0x0D42, 0x04C1,
+	0x0CE1, 0x079F, 0x0C2E, 0x0405, 0x0C22, 0x0461, 0x0E1D, 0x0582,
+	0x073F, 0x0571, 0x0C4D, 0x0DFD, 0x05CE, 0x0C6D, 0x05DE, 0x0415,
+	0x0C45, 0x075F, 0x0C41, 0x0D03, 0x05A1, 0x0711, 0x05CF, 0x0425,
+	0x0C93, 0x0D21, 0x0591, 0x043D, 0x0D12, 0x0501, 0x040F, 0x0511,
+	0x0431, 0x0C03, 0x04A1, 0x078D, 0x0581, 0x041E, 0x040D, 0x0C02,
+	0x040E, 0x05DD, 0x057F, 0x079D, 0x042D, 0x0D9F, 0x0502, 0x056E,
+	0x0412, 0x071F, 0x044E, 0x05BF, 0x0C1D, 0x0482, 0x05AE, 0x042F,
+	0x057D, 0x0491, 0x054E, 0x047D, 0x0DBE, 0x0611, 0x0492, 0x0601,
+	0x05BD, 0x05CD, 0x0426, 0x05ED, 0x072D, 0x073D, 0x0483, 0x0F5D,
+	0x0421, 0x056F, 0x053F, 0x058E, 0x054F, 0x078F, 0x053E, 0x059E,
+	0x057E, 0x051F, 0x055D, 0x0413, 0x070D, 0x05AF, 0x0411, 0x0453,
+	0x0D5E, 0x077E, 0x052F, 0x070F, 0x074D, 0x0692, 0x070E, 0x072F,
+	0x072E, 0x058F, 0x071D, 0x052E, 0x0417, 0x073E, 0x0781, 0x078E,
+	0x055F, 0x060E, 0x0622, 0x0683, 0x0702, 0x071E, 0x076F, 0x079E,
+	0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_8x6 {
+	8, 6,
+	{ 273, 186 },
+	{ 880, 300 },
+	{ 0, 64 },
+	{ percentile_arr_8x6_0, percentile_arr_8x6_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8)
+static const uint16_t percentile_arr_8x8_0[347] {
+	0x0334, 0xFD44, 0xDD14, 0x9154, 0x9B08, 0x906A, 0x8928, 0x8108,
+	0xE866, 0xC918, 0x606F, 0xC0FE, 0x5963, 0x58EE, 0x6534, 0x505A,
+	0x51E2, 0xA8CF, 0x5354, 0x5314, 0x5134, 0x5524, 0x48F3, 0x504B,
+	0x487E, 0x5344, 0x49C3, 0x4972, 0x49F2, 0x4856, 0xD0EF, 0x81D2,
+	0x78DE, 0x4261, 0x3AC1, 0x71E3, 0x6879, 0x390C, 0x3143, 0x31B3,
+	0x385F, 0x3153, 0x306E, 0x3037, 0x30DF, 0x3162, 0x304F, 0x3075,
+	0xB03B, 0x2847, 0x28E3, 0x2914, 0x507F, 0x28BF, 0x5173, 0x5073,
+	0x20D3, 0x2A06, 0x2827, 0x2508, 0x2229, 0x29D3, 0x204A, 0x207A,
+	0x2046, 0x4148, 0x20FD, 0x4225, 0x23A1, 0x3944, 0x2065, 0x1924,
+	0x2324, 0x1806, 0x19F1, 0x2215, 0x1876, 0x22AD, 0x502B, 0x1B04,
+	0x18F2, 0x3A4D, 0x3216, 0x3504, 0x18DD, 0x1B21, 0x10CE, 0x1869,
+	0x1B41, 0x1855, 0x1207, 0x1AE1, 0x2845, 0x19D1, 0x2A0A, 0x1A2D,
+	0x2A1A, 0x11C2, 0x1A0B, 0x1217, 0x2816, 0x121B, 0x1271, 0x2AD1,
+	0x1035, 0x1015, 0x287D, 0x12F1, 0x43C1, 0x1171, 0x1A05, 0x08E2,
+	0x11E1, 0x3251, 0x2049, 0x20F1, 0x12CD, 0x0A39, 0x1219, 0x1059,
+	0x1104, 0x1036, 0x1872, 0x3007, 0x08ED, 0x205E, 0x1026, 0x0952,
+	0x1392, 0x1019, 0x0951, 0x100A, 0x13EE, 0x08D2, 0x1242, 0x0ABD,
+	0x22A2, 0x0BDF, 0x2B81, 0x0A35, 0x13B1, 0x0839, 0x13BF, 0x0A33,
+	0x1B31, 0x205D, 0x1241, 0x183A, 0x2025, 0x0B93, 0x0A3D, 0x1017,
+	0x1313, 0x1253, 0x082A, 0x204E, 0x09A2, 0x080B, 0x0A1F, 0x125D,
+	0x0A2E, 0x081A, 0x08D1, 0x082F, 0x086D, 0x1B82, 0x0A09, 0x0B22,
+	0x1062, 0x11A3, 0x2161, 0x0923, 0x129F, 0x1A62, 0x0871, 0x0942,
+	0x081B, 0x1133, 0x18AE, 0x0A9E, 0x0863, 0x09FF, 0x18C2, 0x0B51,
+	0x08BD, 0x0AA3, 0x09B1, 0x1AC2, 0x08B3, 0x0829, 0x0BEF, 0x0B83,
+	0x0AAE, 0x0A8D, 0x1857, 0x185B, 0x08AF, 0x103F, 0x08C3, 0x09B2,
+	0x0A4E, 0x11C1, 0x0A31, 0x0B42, 0x0A83, 0x0BFF, 0x13DD, 0x00CD,
+	0x0AB3, 0x0842, 0x08BE, 0x0922, 0x1A8E, 0x08E1, 0x002E, 0x0BA2,
+	0x0A8F, 0x2263, 0x0252, 0x0B32, 0x0AC3, 0x0941, 0x0A43, 0x083D,
+	0x083E, 0x0A3E, 0x084D, 0x1131, 0x136F, 0x0AB1, 0x0193, 0x0BFD,
+	0x0391, 0x0851, 0x13AF, 0x0843, 0x0213, 0x1226, 0x0932, 0x03B2,
+	0x0902, 0x0BCD, 0x0221, 0x089E, 0x00B1, 0x0BDE, 0x03FE, 0x02A1,
+	0x0982, 0x009F, 0x080E, 0x0B5F, 0x02BE, 0x0A32, 0x0A2A, 0x01EE,
+	0x0053, 0x0AB2, 0x0192, 0x09FD, 0x0052, 0x0B03, 0x0293, 0x00A2,
+	0x0B7F, 0x0BED, 0x0311, 0x08B2, 0x0A72, 0x088E, 0x0333, 0x0B12,
+	0x0A23, 0x0822, 0x0083, 0x11CE, 0x021D, 0x08A3, 0x088F, 0x029D,
+	0x0A22, 0x0A3F, 0x01FE, 0x020F, 0x0983, 0x02D2, 0x0292, 0x0B23,
+	0x001E, 0x0BCF, 0x03CE, 0x09AF, 0x0B02, 0x0301, 0x022F, 0x137E,
+	0x021E, 0x09EF, 0x016F, 0x0112, 0x097E, 0x080F, 0x020D, 0x0092,
+	0x01DE, 0x09DF, 0x0032, 0x0033, 0x0A82, 0x03BE, 0x0B6E, 0x001F,
+	0x020E, 0x0023, 0x09CF, 0x0113, 0x0103, 0x0013, 0x0BAE, 0x0203,
+	0x0BAD, 0x01BF, 0x034F, 0x095F, 0x036D, 0x0202, 0x017F, 0x0093,
+	0x0201, 0x034D, 0x0212, 0x035D, 0x03BD, 0x0B3F, 0x035E, 0x0211,
+	0x0281, 0x0291, 0x032E, 0x037D, 0x034E, 0x038E, 0x039F, 0x032F,
+	0x033E, 0x038F, 0x039E
+};
+
+static const uint16_t percentile_arr_8x8_1[208] {
+	0x0621, 0x3443, 0x47CD, 0x97AE, 0xFC62, 0x14F1, 0x24C2, 0x25DF,
+	0x3C33, 0x1C52, 0x9C72, 0x0FBE, 0x0C5D, 0x343E, 0x24A3, 0x1551,
+	0x5D32, 0x1CD2, 0x15EF, 0x4E31, 0x04DD, 0x1FDD, 0x174F, 0x0DD1,
+	0x3E0D, 0x15FF, 0x0DA2, 0x1E03, 0x17BD, 0x177D, 0x14B3, 0x0471,
+	0x0CAE, 0x1C1F, 0x04D1, 0x0F6E, 0x0DFE, 0x1C42, 0x0C16, 0x0D22,
+	0x0C9F, 0x2C2E, 0x0FAD, 0x0571, 0x147D, 0x0C07, 0x04B2, 0x0F6D,
+	0x0F5E, 0x07AF, 0x146D, 0x0C51, 0x0593, 0x2583, 0x0C4E, 0x040B,
+	0x0C35, 0x0513, 0x0E91, 0x0406, 0x073F, 0x144D, 0x0561, 0x048F,
+	0x0F01, 0x0F4E, 0x0CA2, 0x075F, 0x1682, 0x04E1, 0x0C1A, 0x04BD,
+	0x0542, 0x0D41, 0x0DEE, 0x04CD, 0x0DCF, 0x04B1, 0x0C15, 0x0C3D,
+	0x0423, 0x0592, 0x0DDE, 0x0422, 0x0432, 0x05FD, 0x0DC1, 0x05B1,
+	0x0DCE, 0x0612, 0x0C2F, 0x0445, 0x0602, 0x0531, 0x0439, 0x0E81,
+	0x0582, 0x0C61, 0x061D, 0x049E, 0x0405, 0x0409, 0x0DBE, 0x079F,
+	0x0D21, 0x04C1, 0x0C0A, 0x0E13, 0x04AD, 0x040E, 0x0581, 0x0419,
+	0x05DD, 0x0D03, 0x049D, 0x0449, 0x0429, 0x048E, 0x0DA1, 0x0425,
+	0x0512, 0x0501, 0x0431, 0x0523, 0x0441, 0x042D, 0x040F, 0x0D7D,
+	0x0511, 0x0502, 0x05BF, 0x04A1, 0x0C03, 0x0402, 0x079D, 0x05AE,
+	0x075D, 0x057F, 0x041D, 0x048D, 0x042A, 0x0453, 0x05AF, 0x078D,
+	0x0C0D, 0x073D, 0x0491, 0x0591, 0x05BD, 0x072D, 0x057E, 0x051F,
+	0x0482, 0x0492, 0x041E, 0x0412, 0x0D9F, 0x0421, 0x0493, 0x0711,
+	0x056E, 0x059E, 0x054E, 0x0611, 0x05ED, 0x074D, 0x070F, 0x056F,
+	0x052F, 0x053F, 0x071F, 0x054F, 0x05CD, 0x0483, 0x055E, 0x072F,
+	0x0E01, 0x0426, 0x058F, 0x0413, 0x078F, 0x071D, 0x055F, 0x058E,
+	0x0411, 0x053E, 0x071E, 0x055D, 0x077E, 0x052E, 0x0692, 0x0417,
+	0x070D, 0x078E, 0x070E, 0x072E, 0x041B, 0x060E, 0x0622, 0x0683,
+	0x068D, 0x0702, 0x073E, 0x076F, 0x0781, 0x079E, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_8x8 {
+	8, 8,
+	{ 347, 208 },
+	{ 1144, 267 },
+	{ 0, 38 },
+	{ percentile_arr_8x8_0, percentile_arr_8x8_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5)
+static const uint16_t percentile_arr_10x5_0[274] {
+	0x0165, 0xF975, 0xD866, 0xC056, 0xA946, 0x90C6, 0x90F5, 0x8963,
+	0x80D6, 0x80E6, 0x60F3, 0x61C3, 0x59F2, 0xA927, 0x5075, 0x4847,
+	0x5153, 0x4955, 0x49E2, 0x48B6, 0x41D2, 0x4943, 0x8305, 0x8172,
+	0x4046, 0x4037, 0x40A7, 0x70B7, 0x7AC1, 0x31E3, 0x7027, 0x30E5,
+	0x69D3, 0x99B3, 0x3315, 0x6115, 0x3136, 0x3076, 0x3173, 0x30D5,
+	0x3106, 0x8962, 0x2916, 0x30C7, 0x5126, 0x30D3, 0x2956, 0x5117,
+	0x2B41, 0x2AE1, 0x2A61, 0x29F1, 0x2306, 0x2145, 0x4A85, 0x2057,
+	0x40E3, 0x4137, 0x3B21, 0x23C1, 0x2065, 0x1925, 0x51C2, 0x5225,
+	0x4935, 0x1AD1, 0x23A1, 0x19D1, 0x1A71, 0x4055, 0x1873, 0x1A86,
+	0x1295, 0x18F2, 0x28A6, 0x1952, 0x4AA5, 0x20B5, 0x10C5, 0x2AA2,
+	0x11E1, 0x1107, 0x10D2, 0x2171, 0x1351, 0x3036, 0x1331, 0x1BEE,
+	0x2035, 0x1045, 0x1313, 0x0A15, 0x1087, 0x1296, 0x13EF, 0x18E2,
+	0x1151, 0x1086, 0x10F1, 0x08A5, 0x12C2, 0x1BFF, 0x1095, 0x1A62,
+	0x1322, 0x0942, 0x1026, 0x1872, 0x1062, 0x0897, 0x1123, 0x08D1,
+	0x1A06, 0x0806, 0x137F, 0x13B1, 0x13DF, 0x1A51, 0x09B1, 0x0A83,
+	0x1015, 0x22F1, 0x0961, 0x0B81, 0x12B3, 0x0A35, 0x0AA3, 0x20B3,
+	0x08C3, 0x2342, 0x0933, 0x0A33, 0x09A2, 0x10C2, 0x0896, 0x2205,
+	0x0825, 0x20E1, 0x0922, 0x1242, 0x0B16, 0x0B32, 0x09A3, 0x0AC3,
+	0x0BBF, 0x0B93, 0x0071, 0x0931, 0x0A41, 0x2392, 0x13FE, 0x09C1,
+	0x0B07, 0x0016, 0x1182, 0x09B2, 0x0A26, 0x0132, 0x0941, 0x0A93,
+	0x0992, 0x1063, 0x1217, 0x01FF, 0x11EE, 0x1216, 0x0B23, 0x0B82,
+	0x0042, 0x1102, 0x0213, 0x0B6F, 0x09FE, 0x1207, 0x0807, 0x18B1,
+	0x0253, 0x0AB1, 0x08A2, 0x13FD, 0x01FD, 0x1983, 0x0AB2, 0x0A31,
+	0x016F, 0x0B11, 0x00B2, 0x0851, 0x0AD2, 0x0993, 0x0BDD, 0x12A1,
+	0x017F, 0x0A97, 0x1022, 0x0383, 0x0843, 0x0A52, 0x03A2, 0x097E,
+	0x0817, 0x03B2, 0x0A43, 0x09EF, 0x0A63, 0x0B33, 0x0B03, 0x0292,
+	0x0272, 0x09CE, 0x0287, 0x136D, 0x0053, 0x0B12, 0x0083, 0x0892,
+	0x0112, 0x1282, 0x03ED, 0x0852, 0x0301, 0x1391, 0x0232, 0x0B7E,
+	0x0221, 0x08A3, 0x0BCD, 0x0BCF, 0x036E, 0x09DE, 0x0103, 0x03DE,
+	0x0832, 0x0BAF, 0x0302, 0x13CE, 0x035F, 0x0093, 0x0A23, 0x01DF,
+	0x0013, 0x0A22, 0x0023, 0x0113, 0x09AF, 0x01BF, 0x0033, 0x095F,
+	0x0203, 0x0281, 0x09CF, 0x037D, 0x0201, 0x0B4D, 0x03AE, 0x03BE,
+	0x0291, 0x035E, 0x038E, 0x0B9F, 0x03AD, 0x0202, 0x034F, 0x0211,
+	0x035D, 0x0212, 0x032E, 0x039E, 0x033F, 0x034E, 0x03BD, 0x032F,
+	0x033E, 0x038F
+};
+
+static const uint16_t percentile_arr_10x5_1[180] {
+	0x0532, 0xFCA3, 0x3621, 0x6E82, 0x2CC2, 0x3D51, 0x3F01, 0x2691,
+	0x17AE, 0x35A2, 0x74B3, 0x1603, 0x4433, 0x3C43, 0x6C35, 0x25D1,
+	0x1D13, 0x15DF, 0x37CD, 0x0D93, 0x1D22, 0x0E81, 0x1452, 0x0CD2,
+	0x37BE, 0x0CB2, 0x3407, 0x1523, 0x0C16, 0x0CB5, 0x0C96, 0x1486,
+	0x2631, 0x1506, 0x0F4F, 0x1583, 0x0CD1, 0x2CA2, 0x2612, 0x1613,
+	0x1602, 0x1F11, 0x179F, 0x17BD, 0x15B1, 0x0406, 0x1D41, 0x0CF1,
+	0x0D31, 0x0442, 0x1C62, 0x0F6E, 0x077D, 0x0C51, 0x0445, 0x0D15,
+	0x2592, 0x0CB1, 0x05EF, 0x0542, 0x17AF, 0x1425, 0x075E, 0x0FAD,
+	0x0CC1, 0x0503, 0x0512, 0x15C1, 0x0C95, 0x0415, 0x0505, 0x0F4E,
+	0x04A5, 0x0493, 0x0C32, 0x0F5F, 0x04E1, 0x0521, 0x0C85, 0x07DD,
+	0x0582, 0x15FF, 0x05CF, 0x0405, 0x0D91, 0x05A1, 0x05FE, 0x0C23,
+	0x0561, 0x0472, 0x0471, 0x0C22, 0x0DEE, 0x076D, 0x0502, 0x0426,
+	0x0C61, 0x0D7D, 0x0525, 0x05DE, 0x0DCE, 0x079D, 0x0692, 0x0441,
+	0x0C91, 0x05DD, 0x0511, 0x057F, 0x0611, 0x0DFD, 0x078D, 0x056E,
+	0x0492, 0x04A1, 0x073F, 0x0C31, 0x05BE, 0x0483, 0x0571, 0x056F,
+	0x0D9F, 0x0581, 0x0501, 0x057E, 0x05BF, 0x078F, 0x0516, 0x05ED,
+	0x0402, 0x0F7E, 0x0482, 0x054E, 0x075D, 0x071F, 0x05CD, 0x0535,
+	0x05AE, 0x0C11, 0x058F, 0x05AF, 0x0421, 0x0413, 0x0601, 0x054F,
+	0x073D, 0x059E, 0x0487, 0x070F, 0x078E, 0x0781, 0x053E, 0x0403,
+	0x072D, 0x055D, 0x05BD, 0x079E, 0x0D8E, 0x0412, 0x052E, 0x074D,
+	0x053F, 0x051F, 0x070E, 0x055F, 0x072F, 0x052F, 0x070D, 0x055E,
+	0x0417, 0x0453, 0x072E, 0x0622, 0x0683, 0x0702, 0x071D, 0x071E,
+	0x073E, 0x076F, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_10x5 {
+	10, 5,
+	{ 274, 180 },
+	{ 954, 324 },
+	{ 0, 79 },
+	{ percentile_arr_10x5_0, percentile_arr_10x5_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6)
+static const uint16_t percentile_arr_10x6_0[325] {
+	0x01A4, 0xF954, 0xA066, 0x9975, 0x80F5, 0x7056, 0x6918, 0x6963,
+	0x58C6, 0x5946, 0x5928, 0x5174, 0x586F, 0xA0E6, 0x5108, 0x48D6,
+	0x49E2, 0x40F3, 0x9172, 0x41F2, 0xB875, 0x3927, 0x39C3, 0xA953,
+	0x3934, 0x3305, 0x30B6, 0x6943, 0x31D2, 0x3876, 0x3037, 0x2955,
+	0x30A7, 0x32C1, 0x29B3, 0x3027, 0x287E, 0x30B7, 0x29E3, 0x5846,
+	0x2B15, 0x2847, 0x3162, 0x5173, 0x4936, 0x285F, 0x48D3, 0x2164,
+	0x4906, 0x20E5, 0x2915, 0x2116, 0x407F, 0x20D5, 0x2A61, 0x4117,
+	0x20E3, 0x2126, 0x4148, 0x206E, 0x39D3, 0x2145, 0x41B4, 0x1B06,
+	0x2114, 0x2165, 0x5321, 0x5A85, 0x1A4D, 0x1A1F, 0x19F1, 0x3341,
+	0x184F, 0x1956, 0x3125, 0x30C7, 0x28F2, 0x1937, 0x1AE1, 0x1073,
+	0x1BA1, 0x1935, 0x110C, 0x1BC1, 0x3A25, 0x19C2, 0x1295, 0x122E,
+	0x1944, 0x11D1, 0x1124, 0x1857, 0x22D1, 0x2286, 0x1A2D, 0x12A2,
+	0x2107, 0x1055, 0x2065, 0x0A71, 0x2152, 0x10C5, 0x10D2, 0x1331,
+	0x08B5, 0x1171, 0x2836, 0x10A6, 0x0904, 0x123D, 0x20F1, 0x12A5,
+	0x10E2, 0x107D, 0x1AF1, 0x1313, 0x0951, 0x11E1, 0x1B22, 0x1B51,
+	0x0835, 0x101E, 0x0A5D, 0x0A15, 0x3045, 0x0A96, 0x08A5, 0x1142,
+	0x12A3, 0x1872, 0x085D, 0x09B1, 0x100E, 0x0887, 0x0886, 0x086D,
+	0x0933, 0x12B3, 0x0897, 0x08B3, 0x0A33, 0x0923, 0x1095, 0x0BEE,
+	0x2BB1, 0x085E, 0x1283, 0x0A51, 0x1026, 0x0A06, 0x12C2, 0x08D1,
+	0x11A2, 0x13BF, 0x08C3, 0x10C2, 0x0A3E, 0x0BDF, 0x0B81, 0x13EF,
+	0x0A35, 0x0B16, 0x082F, 0x2161, 0x1B32, 0x0806, 0x084E, 0x11A3,
+	0x1015, 0x1122, 0x2931, 0x0342, 0x0825, 0x0A0F, 0x0896, 0x0A05,
+	0x0241, 0x09C1, 0x083F, 0x0A42, 0x0071, 0x0B07, 0x082E, 0x0393,
+	0x12B1, 0x0A62, 0x0226, 0x0A2F, 0x0B92, 0x0063, 0x0932, 0x0862,
+	0x09FF, 0x0A31, 0x00E1, 0x12B2, 0x09B2, 0x0AC3, 0x0941, 0x0293,
+	0x1323, 0x104D, 0x003E, 0x083D, 0x0992, 0x1382, 0x03FF, 0x0A13,
+	0x1016, 0x0A53, 0x0182, 0x1007, 0x0AA1, 0x080F, 0x0A16, 0x0A1E,
+	0x0042, 0x0902, 0x13DD, 0x0BB2, 0x0A63, 0x00A2, 0x08B1, 0x03FE,
+	0x1207, 0x08B2, 0x0B83, 0x09EE, 0x0311, 0x0A87, 0x0BAF, 0x03A2,
+	0x09FD, 0x0051, 0x0B33, 0x020D, 0x09CE, 0x0217, 0x021D, 0x0817,
+	0x020E, 0x0A4E, 0x001F, 0x0BFD, 0x0297, 0x0983, 0x0A92, 0x0252,
+	0x0243, 0x0B03, 0x0193, 0x036F, 0x0B12, 0x0043, 0x0822, 0x0A21,
+	0x01FE, 0x0853, 0x037F, 0x023F, 0x0BED, 0x02D2, 0x0B91, 0x0232,
+	0x0282, 0x0912, 0x08A3, 0x0852, 0x0223, 0x0BCD, 0x0083, 0x0301,
+	0x0832, 0x01EF, 0x0892, 0x0302, 0x0A72, 0x03DE, 0x0893, 0x0BCF,
+	0x09DE, 0x03CE, 0x035F, 0x0833, 0x0023, 0x0103, 0x017E, 0x0813,
+	0x01CF, 0x01BF, 0x016F, 0x0A22, 0x037E, 0x0113, 0x01AF, 0x0B6E,
+	0x03BE, 0x0201, 0x0A03, 0x01DF, 0x036D, 0x03AE, 0x015F, 0x0281,
+	0x033E, 0x0A02, 0x038E, 0x017F, 0x0291, 0x034D, 0x03BD, 0x0B7D,
+	0x03AD, 0x0211, 0x0212, 0x034F, 0x032E, 0x039F, 0x034E, 0x035D,
+	0x035E, 0x033F, 0x039E, 0x032F, 0x038F
+};
+
+static const uint16_t percentile_arr_10x6_1[199] {
+	0x0621, 0xBD32, 0x5CA3, 0x1FAE, 0x64C2, 0x1D51, 0x6C33, 0xFC43,
+	0x5CB3, 0x25A2, 0x2E82, 0x35D1, 0x4F01, 0x3FBE, 0x3691, 0x2DDF,
+	0x2E03, 0x3FCD, 0x14D2, 0x1CF1, 0x0C52, 0x3C35, 0x2D22, 0x1513,
+	0x1462, 0x54B2, 0x0E31, 0x4E81, 0x1593, 0x1D23, 0x1CD1, 0x14B5,
+	0x2FBD, 0x0C07, 0x1D06, 0x0DEF, 0x14A2, 0x1612, 0x1F4F, 0x0C16,
+	0x1F7D, 0x0C96, 0x0486, 0x1F9F, 0x0D42, 0x4583, 0x0E02, 0x0472,
+	0x0DB1, 0x1613, 0x0FAD, 0x0D41, 0x0F11, 0x0E0D, 0x1C42, 0x143E,
+	0x076E, 0x04B1, 0x0FAF, 0x0D61, 0x0531, 0x0C71, 0x0DFF, 0x0DFE,
+	0x0406, 0x0C45, 0x0451, 0x0D15, 0x05C1, 0x2CC1, 0x141F, 0x0CE1,
+	0x0FDD, 0x0C22, 0x0582, 0x0D92, 0x0571, 0x0F6D, 0x0C93, 0x045D,
+	0x0F5E, 0x044D, 0x0423, 0x0D05, 0x0425, 0x0C95, 0x04A5, 0x0DCE,
+	0x075F, 0x0E1D, 0x0503, 0x042E, 0x0D91, 0x0512, 0x0DDE, 0x05A1,
+	0x074E, 0x0C32, 0x0431, 0x0415, 0x0D21, 0x05EE, 0x040E, 0x0DDD,
+	0x0485, 0x1525, 0x0491, 0x0C26, 0x046D, 0x0C05, 0x05CF, 0x05FD,
+	0x0E92, 0x073F, 0x0C0D, 0x043D, 0x0502, 0x0C1E, 0x041D, 0x0461,
+	0x04A1, 0x0511, 0x0581, 0x05BD, 0x0C41, 0x059F, 0x05BF, 0x040F,
+	0x0C7D, 0x0402, 0x054E, 0x057D, 0x0403, 0x078D, 0x05AE, 0x042D,
+	0x0483, 0x079D, 0x0D7F, 0x0482, 0x0611, 0x056E, 0x0516, 0x05BE,
+	0x0535, 0x044E, 0x05AF, 0x0DED, 0x042F, 0x0492, 0x058E, 0x078F,
+	0x0412, 0x057E, 0x053E, 0x0F1F, 0x073D, 0x0601, 0x0501, 0x075D,
+	0x059E, 0x05CD, 0x053F, 0x054F, 0x055E, 0x055D, 0x0421, 0x074D,
+	0x051F, 0x072F, 0x0781, 0x0411, 0x0D6F, 0x077E, 0x0487, 0x070E,
+	0x070F, 0x072D, 0x058F, 0x078E, 0x079E, 0x052E, 0x0413, 0x072E,
+	0x071D, 0x052F, 0x055F, 0x073E, 0x0417, 0x0453, 0x060E, 0x0622,
+	0x0683, 0x0702, 0x070D, 0x071E, 0x076F, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_10x6 {
+	10, 6,
+	{ 325, 199 },
+	{ 922, 381 },
+	{ 0, 78 },
+	{ percentile_arr_10x6_0, percentile_arr_10x6_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8)
+static const uint16_t percentile_arr_10x8_0[400] {
+	0x0154, 0xAB34, 0xAD44, 0x8308, 0x7866, 0x7B64, 0x79A4, 0x7975,
+	0x686A, 0x6908, 0xC514, 0x6174, 0x6128, 0x6118, 0x5B54, 0x5163,
+	0xF856, 0x50F5, 0x986F, 0xDD34, 0x48FE, 0x4972, 0x48E6, 0x4146,
+	0x48EE, 0x40F3, 0x4AC1, 0x38C6, 0x41E2, 0xBB05, 0x707E, 0x38D6,
+	0x3927, 0x6B14, 0x384B, 0x3948, 0x3153, 0x385A, 0x3134, 0x6B15,
+	0x39F2, 0x30CF, 0x3143, 0x91D2, 0x31C3, 0x60EF, 0x5973, 0x3076,
+	0x28D3, 0x3261, 0x2875, 0x28DE, 0x290C, 0x51E3, 0x28A7, 0x20E3,
+	0x2962, 0x2B06, 0x2917, 0x483B, 0x20B6, 0x2D24, 0x206E, 0x285F,
+	0x20B7, 0x2936, 0x4047, 0x2037, 0x20DF, 0x28BF, 0x21B4, 0x21B3,
+	0x1D08, 0x2027, 0x404F, 0x3846, 0x2116, 0x187F, 0x1879, 0x2285,
+	0x1A29, 0x3915, 0x4873, 0x1955, 0x3114, 0x1B44, 0x2165, 0x107A,
+	0x1956, 0x6137, 0x1106, 0x3145, 0x1B21, 0x19D3, 0x12AD, 0x1B41,
+	0x1AD1, 0x1126, 0x18F2, 0x282B, 0x40E5, 0x20D5, 0x2A0A, 0x284A,
+	0x1286, 0x1295, 0x121A, 0x2A0B, 0x321B, 0x122D, 0x10FD, 0x13A1,
+	0x32A2, 0x12E1, 0x1164, 0x13C1, 0x124D, 0x1239, 0x4504, 0x10C7,
+	0x22F1, 0x11F1, 0x0AC2, 0x2125, 0x1225, 0x0B04, 0x1107, 0x1069,
+	0x1A19, 0x13BF, 0x2A96, 0x08D2, 0x1271, 0x0952, 0x2BDF, 0x0B31,
+	0x1251, 0x2124, 0x0B13, 0x12BD, 0x1233, 0x13EE, 0x2144, 0x0B16,
+	0x0A15, 0x18E2, 0x08DD, 0x1097, 0x0857, 0x0B24, 0x0AA5, 0x12A3,
+	0x11C2, 0x11D1, 0x10CE, 0x0865, 0x123D, 0x08B3, 0x0B51, 0x1971,
+	0x0A41, 0x0A06, 0x1039, 0x080A, 0x0B22, 0x0923, 0x0836, 0x08C3,
+	0x0A1F, 0x1072, 0x080B, 0x0935, 0x0855, 0x18A6, 0x0A42, 0x1133,
+	0x0A83, 0x0A09, 0x0ACD, 0x0A2E, 0x0887, 0x083A, 0x10C5, 0x085E,
+	0x13B1, 0x087D, 0x0819, 0x0A9F, 0x0049, 0x08F1, 0x0BEF, 0x1161,
+	0x0B42, 0x09E1, 0x0A05, 0x0904, 0x12AE, 0x029E, 0x0A31, 0x09FF,
+	0x0951, 0x0859, 0x001A, 0x082F, 0x0B81, 0x08B5, 0x0A35, 0x082A,
+	0x08ED, 0x1142, 0x1262, 0x0B32, 0x08A5, 0x12D2, 0x03DD, 0x0B07,
+	0x18AE, 0x083F, 0x00AF, 0x0AB3, 0x086D, 0x0287, 0x0A93, 0x025D,
+	0x0816, 0x13FF, 0x0A8D, 0x005D, 0x08D1, 0x0392, 0x0845, 0x0AC3,
+	0x08C2, 0x01A3, 0x0AB1, 0x09A2, 0x005B, 0x0B93, 0x02B2, 0x1086,
+	0x001B, 0x0863, 0x0216, 0x0AA1, 0x0896, 0x0A8F, 0x084E, 0x0A8E,
+	0x0A53, 0x0026, 0x0A26, 0x0382, 0x0807, 0x0862, 0x0029, 0x0871,
+	0x00BD, 0x0835, 0x024E, 0x0806, 0x0941, 0x0895, 0x03AF, 0x0A13,
+	0x0932, 0x03ED, 0x0BFD, 0x0207, 0x0B83, 0x0993, 0x09B1, 0x03CD,
+	0x0A3E, 0x03FE, 0x0A21, 0x0015, 0x0B11, 0x0A43, 0x00E1, 0x136F,
+	0x00BE, 0x00A2, 0x0842, 0x0043, 0x0825, 0x082E, 0x0A2A, 0x03DE,
+	0x0BA2, 0x0122, 0x0BCF, 0x004D, 0x0323, 0x09C1, 0x0292, 0x083E,
+	0x0252, 0x0017, 0x0A72, 0x00CD, 0x0182, 0x0A63, 0x0131, 0x09B2,
+	0x0303, 0x0902, 0x0053, 0x035F, 0x0A32, 0x003D, 0x0992, 0x0A2F,
+	0x03B2, 0x0ABE, 0x009F, 0x0183, 0x0312, 0x08B1, 0x0B02, 0x0A17,
+	0x0B7F, 0x0333, 0x0297, 0x0A23, 0x020F, 0x0282, 0x0851, 0x0822,
+	0x03CE, 0x01EE, 0x000E, 0x08B2, 0x0083, 0x0A1D, 0x00A3, 0x0222,
+	0x088F, 0x0112, 0x029D, 0x0092, 0x0A3F, 0x0391, 0x089E, 0x0301,
+	0x01FD, 0x09BF, 0x01CE, 0x0852, 0x01FE, 0x0013, 0x0903, 0x088E,
+	0x037E, 0x021E, 0x01EF, 0x095F, 0x016F, 0x09DE, 0x03BE, 0x020E,
+	0x0113, 0x01DF, 0x080F, 0x020D, 0x0833, 0x03AE, 0x0032, 0x03BD,
+	0x0823, 0x001E, 0x01AF, 0x0203, 0x034F, 0x0093, 0x0A81, 0x036E,
+	0x0291, 0x038E, 0x0A01, 0x001F, 0x017F, 0x01CF, 0x017E, 0x0202,
+	0x0BAD, 0x0211, 0x035D, 0x035E, 0x039F, 0x0212, 0x032E, 0x033F,
+	0x034D, 0x034E, 0x036D, 0x032F, 0x033E, 0x037D, 0x038F, 0x039E
+};
+
+static const uint16_t percentile_arr_10x8_1[221] {
+	0x0621, 0xDFAE, 0x2443, 0x54C2, 0x37CD, 0x1CF1, 0xFCA3, 0x14D2,
+	0x2D32, 0x5551, 0x7DDF, 0x5C33, 0x15D1, 0x3462, 0x24B3, 0x7452,
+	0x5FBE, 0x6472, 0x65A2, 0x1D06, 0x445D, 0x15EF, 0x0E31, 0x1D71,
+	0x343E, 0x0D42, 0x0CDD, 0x1F01, 0x4691, 0x1435, 0x0E82, 0x0DFF,
+	0x17DD, 0x0D22, 0x24B2, 0x1603, 0x04B5, 0x24AE, 0x060D, 0x2D13,
+	0x0C7D, 0x0496, 0x17BD, 0x1F4F, 0x1F7D, 0x1486, 0x0593, 0x1C16,
+	0x0C07, 0x15FE, 0x041F, 0x14D1, 0x0C9F, 0x0E81, 0x0D15, 0x27AF,
+	0x0C2E, 0x0D23, 0x176E, 0x0FAD, 0x1C06, 0x1561, 0x0DB1, 0x040B,
+	0x1C4E, 0x0D83, 0x1711, 0x0C42, 0x0C71, 0x1C1A, 0x0D25, 0x04A2,
+	0x0C45, 0x076D, 0x0F9F, 0x075F, 0x0E12, 0x046D, 0x048F, 0x1D92,
+	0x0602, 0x0C39, 0x174E, 0x0C51, 0x0CA1, 0x075E, 0x05C1, 0x14BD,
+	0x0D31, 0x0423, 0x0F3F, 0x0495, 0x0C93, 0x049E, 0x0D05, 0x04E1,
+	0x0DEE, 0x0415, 0x04B1, 0x0503, 0x0CCD, 0x042F, 0x0DCF, 0x044D,
+	0x0541, 0x1582, 0x05DE, 0x0D01, 0x0487, 0x040A, 0x0516, 0x0CA5,
+	0x05FD, 0x05BF, 0x057D, 0x0DA1, 0x0426, 0x040F, 0x071F, 0x0613,
+	0x0432, 0x0D12, 0x043D, 0x0425, 0x0461, 0x061D, 0x0D21, 0x0591,
+	0x079D, 0x048D, 0x0429, 0x0C49, 0x04C1, 0x042A, 0x040E, 0x0485,
+	0x0511, 0x0405, 0x0502, 0x0441, 0x0C19, 0x0692, 0x0535, 0x058F,
+	0x041D, 0x059F, 0x072D, 0x04AD, 0x049D, 0x05CE, 0x048E, 0x0C31,
+	0x057F, 0x078D, 0x0409, 0x041E, 0x05AE, 0x0611, 0x058E, 0x05DD,
+	0x05CD, 0x056E, 0x0483, 0x073D, 0x054E, 0x0D9E, 0x0402, 0x0491,
+	0x040D, 0x056F, 0x042D, 0x0581, 0x0421, 0x057E, 0x0781, 0x053E,
+	0x0482, 0x078F, 0x0413, 0x052E, 0x0601, 0x0422, 0x0492, 0x055E,
+	0x05BE, 0x0F9E, 0x072F, 0x074D, 0x0412, 0x070F, 0x075D, 0x05BD,
+	0x051F, 0x071D, 0x073E, 0x077E, 0x0403, 0x0411, 0x078E, 0x055D,
+	0x05AF, 0x05ED, 0x052F, 0x053F, 0x070D, 0x070E, 0x072E, 0x054F,
+	0x0417, 0x041B, 0x0453, 0x055F, 0x060E, 0x0622, 0x0683, 0x068D,
+	0x0702, 0x071E, 0x076F, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_10x8 =
+{
+	10, 8,
+	{ 400, 221 },
+	{ 1119, 376 },
+	{ 0, 52 },
+	{ percentile_arr_10x8_0, percentile_arr_10x8_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10)
+static const uint16_t percentile_arr_10x10_0[453] {
+	0x0334, 0x9514, 0x8954, 0x806A, 0x6F14, 0x6724, 0x6108, 0x6364,
+	0x5175, 0x5D44, 0x5866, 0x5118, 0x5308, 0xA179, 0x5128, 0xF534,
+	0x49A4, 0x5354, 0x9174, 0x486F, 0x48EA, 0x40F3, 0x4963, 0x414A,
+	0xF8F9, 0x3984, 0x4172, 0x387E, 0x405A, 0x38DA, 0x38F5, 0x9B05,
+	0x30EE, 0x32C1, 0x3261, 0x3D08, 0x31E2, 0x3056, 0x292B, 0x3146,
+	0x3127, 0x3315, 0x58CA, 0x58E6, 0x290C, 0x3314, 0x8134, 0x28E3,
+	0x28FE, 0x2948, 0x28C6, 0x78DE, 0x28BB, 0x68D6, 0x286E, 0x2173,
+	0x2962, 0x21D2, 0x205F, 0x49F2, 0x2917, 0x2306, 0x207F, 0x404F,
+	0x2153, 0x2943, 0x20CF, 0x21C3, 0x2073, 0x20D3, 0x2136, 0x183B,
+	0x430A, 0x40A7, 0x18B6, 0x2079, 0x2309, 0x2075, 0x184B, 0x20EF,
+	0x187A, 0x7837, 0x1B19, 0x20AB, 0x18BA, 0x20B7, 0x1994, 0x19E3,
+	0x21B4, 0x49B3, 0x38BF, 0x193B, 0x1876, 0x182B, 0x30F2, 0x193A,
+	0x1827, 0x1965, 0x1914, 0x184A, 0x4047, 0x1916, 0x1285, 0x1937,
+	0x122D, 0x1915, 0x1321, 0x1955, 0x1046, 0x191B, 0x2106, 0x2919,
+	0x1344, 0x1524, 0x12E1, 0x3926, 0x10E5, 0x2295, 0x1159, 0x1145,
+	0x10DF, 0x124D, 0x1271, 0x092A, 0x2169, 0x1704, 0x22A2, 0x1164,
+	0x13EE, 0x12F1, 0x0AD1, 0x128A, 0x110A, 0x11D3, 0x1286, 0x115A,
+	0x2BA1, 0x0BBF, 0x3956, 0x2A89, 0x12AD, 0x10E9, 0x0B41, 0x1A29,
+	0x2225, 0x08FD, 0x1107, 0x08D5, 0x191A, 0x1125, 0x1A96, 0x0B04,
+	0x18D9, 0x2B16, 0x11F1, 0x0A33, 0x0924, 0x131A, 0x1149, 0x1324,
+	0x0BEF, 0x0A99, 0x08CB, 0x123D, 0x1331, 0x0BDF, 0x0872, 0x22A3,
+	0x0AC2, 0x1144, 0x0D04, 0x08D2, 0x08CE, 0x0AA9, 0x0A9A, 0x0B13,
+	0x1251, 0x0865, 0x1069, 0x0897, 0x1215, 0x18B3, 0x1A62, 0x08C7,
+	0x185E, 0x10E2, 0x0AA5, 0x21FF, 0x090B, 0x0952, 0x09E1, 0x0A42,
+	0x08F1, 0x0A06, 0x0B22, 0x087D, 0x1139, 0x021F, 0x122E, 0x082F,
+	0x09C2, 0x0887, 0x0A0A, 0x03C1, 0x0929, 0x0A5D, 0x0A83, 0x0BFF,
+	0x0935, 0x085B, 0x0104, 0x08DD, 0x0923, 0x083F, 0x0241, 0x09D1,
+	0x0A39, 0x0863, 0x0A8B, 0x08A6, 0x008B, 0x1133, 0x13B1, 0x089B,
+	0x0AB3, 0x0036, 0x0BDD, 0x08ED, 0x0857, 0x0971, 0x0219, 0x1235,
+	0x0AB1, 0x0ACD, 0x036F, 0x0A31, 0x08AA, 0x003A, 0x08C3, 0x0A05,
+	0x02BD, 0x0B92, 0x0B07, 0x12B2, 0x08C5, 0x0B51, 0x0381, 0x0A8D,
+	0x01A3, 0x0896, 0x0855, 0x0BFD, 0x005D, 0x0BFE, 0x023E, 0x08AF,
+	0x00B9, 0x0A93, 0x00B5, 0x0862, 0x0A0B, 0x0A09, 0x0A72, 0x0332,
+	0x0AA1, 0x08C9, 0x024E, 0x1382, 0x0951, 0x00A5, 0x0A2A, 0x0059,
+	0x0A9E, 0x0B42, 0x004E, 0x0942, 0x03ED, 0x09B2, 0x02D2, 0x0849,
+	0x0035, 0x0216, 0x0961, 0x0BAF, 0x00AE, 0x0826, 0x0287, 0x0A1A,
+	0x0393, 0x0221, 0x09A2, 0x086D, 0x0226, 0x0871, 0x0039, 0x082A,
+	0x08C2, 0x08E1, 0x0845, 0x0207, 0x0B23, 0x0015, 0x00D1, 0x0B83,
+	0x037F, 0x0252, 0x08A9, 0x0099, 0x0A13, 0x0053, 0x0807, 0x03CD,
+	0x0BDE, 0x0016, 0x089A, 0x0232, 0x035F, 0x0A8E, 0x0AC3, 0x022F,
+	0x0263, 0x0829, 0x004D, 0x0132, 0x0806, 0x0311, 0x01B1, 0x0941,
+	0x0086, 0x000B, 0x1122, 0x0025, 0x0842, 0x00BD, 0x0BCF, 0x03A2,
+	0x0043, 0x0B03, 0x0895, 0x0A8F, 0x008A, 0x09EF, 0x0253, 0x0A1B,
+	0x0182, 0x0243, 0x0A92, 0x00CD, 0x083E, 0x030B, 0x0223, 0x081A,
+	0x0A9F, 0x0193, 0x00BE, 0x0017, 0x0931, 0x0391, 0x037E, 0x09C1,
+	0x0312, 0x0333, 0x03B2, 0x083D, 0x08B1, 0x00B2, 0x002E, 0x021D,
+	0x0A9D, 0x0192, 0x02AE, 0x0102, 0x0022, 0x081B, 0x0222, 0x009E,
+	0x021E, 0x000A, 0x089F, 0x0217, 0x0BCE, 0x0052, 0x020F, 0x0A97,
+	0x0282, 0x008E, 0x0A3F, 0x01FD, 0x00A3, 0x0019, 0x08A2, 0x0301,
+	0x036E, 0x01FE, 0x03BE, 0x0ABE, 0x01CE, 0x0302, 0x029B, 0x0051,
+	0x0883, 0x008F, 0x0BAE, 0x01DF, 0x0183, 0x0912, 0x000E, 0x020D,
+	0x01EE, 0x0B4F, 0x0033, 0x0103, 0x020E, 0x0832, 0x01AF, 0x0913,
+	0x01DE, 0x0203, 0x001E, 0x0092, 0x0093, 0x000F, 0x015F, 0x0291,
+	0x0281, 0x0813, 0x001F, 0x01CF, 0x033F, 0x0023, 0x01BF, 0x0202,
+	0x016F, 0x017E, 0x03AD, 0x0201, 0x034E, 0x0BBD, 0x036D, 0x017F,
+	0x0211, 0x038E, 0x0212, 0x032E, 0x034D, 0x035E, 0x037D, 0x039E,
+	0x032F, 0x033E, 0x035D, 0x038F, 0x039F
+};
+
+static const uint16_t percentile_arr_10x10_1[234] {
+	0x07CD, 0x6E21, 0x24F1, 0x8443, 0xD7AE, 0x24C2, 0x1C62, 0xCCA3,
+	0x1C33, 0xFDEF, 0x2532, 0x55DF, 0x1472, 0x6C3E, 0x14D2, 0x34DD,
+	0x1452, 0x745D, 0x4D51, 0x8DD1, 0x247D, 0x75FF, 0x0CB3, 0x17BE,
+	0x6CAE, 0x17DD, 0x1571, 0x3D06, 0x4E31, 0x0DA2, 0x67BD, 0x160D,
+	0x2C4E, 0x0D22, 0x176E, 0x3CB2, 0x142E, 0x4DFE, 0x0F4F, 0x1435,
+	0x0F01, 0x0D42, 0x0F7D, 0x0CB5, 0x1E03, 0x149F, 0x1C96, 0x141F,
+	0x14B9, 0x0FAF, 0x0439, 0x0E91, 0x2682, 0x1D13, 0x1FAD, 0x0407,
+	0x3471, 0x0C86, 0x0F6D, 0x0D15, 0x0D61, 0x040B, 0x0C6D, 0x0C16,
+	0x0C9A, 0x0D0A, 0x0593, 0x0CD1, 0x248F, 0x0C2F, 0x3C42, 0x1523,
+	0x0445, 0x0E81, 0x0CA2, 0x1525, 0x0406, 0x1C8A, 0x0C1A, 0x04BD,
+	0x0F5E, 0x0F3F, 0x1F4E, 0x0E1D, 0x0423, 0x0DCF, 0x044D, 0x0D92,
+	0x0583, 0x0DB1, 0x1449, 0x15EE, 0x0F5F, 0x079F, 0x0D19, 0x0409,
+	0x04CD, 0x05FD, 0x143D, 0x0612, 0x0D03, 0x0D82, 0x04B1, 0x0C95,
+	0x0C2A, 0x049E, 0x05AF, 0x0D31, 0x05BE, 0x04E1, 0x0D05, 0x0516,
+	0x0711, 0x05C1, 0x0509, 0x0D41, 0x0493, 0x048E, 0x0602, 0x05BF,
+	0x0CA5, 0x0529, 0x0535, 0x0D12, 0x0539, 0x0451, 0x0C29, 0x071F,
+	0x040A, 0x0F3D, 0x0432, 0x059F, 0x0425, 0x0C99, 0x05DE, 0x05CE,
+	0x0C0F, 0x0489, 0x051A, 0x0501, 0x0415, 0x057F, 0x0431, 0x0E13,
+	0x040D, 0x041D, 0x075D, 0x0C53, 0x0502, 0x04C1, 0x049D, 0x0426,
+	0x040E, 0x05A1, 0x055F, 0x0781, 0x0591, 0x04A9, 0x048B, 0x0D8E,
+	0x052E, 0x0412, 0x0521, 0x0405, 0x04AD, 0x074D, 0x0611, 0x077E,
+	0x078F, 0x078D, 0x048D, 0x041E, 0x0487, 0x0461, 0x0C85, 0x05ED,
+	0x0402, 0x0483, 0x0419, 0x0511, 0x0491, 0x0482, 0x059E, 0x068D,
+	0x055D, 0x072E, 0x05DD, 0x054E, 0x0441, 0x0422, 0x052F, 0x057D,
+	0x072D, 0x079D, 0x0CA1, 0x072F, 0x079E, 0x0581, 0x042D, 0x055E,
+	0x0601, 0x0413, 0x0692, 0x0403, 0x051F, 0x053F, 0x054F, 0x05CD,
+	0x070F, 0x071D, 0x05AE, 0x05BD, 0x0492, 0x056E, 0x0411, 0x0417,
+	0x041B, 0x0421, 0x053E, 0x056F, 0x057E, 0x058F, 0x060E, 0x0622,
+	0x0683, 0x0702, 0x070D, 0x070E, 0x071E, 0x073E, 0x076F, 0x078E,
+	0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_10x10 {
+	10, 10,
+	{ 453, 234 },
+	{ 1095, 472 },
+	{ 0, 70 },
+	{ percentile_arr_10x10_0, percentile_arr_10x10_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10)
+static const uint16_t percentile_arr_12x10_0[491] {
+	0x0334, 0x9954, 0x8514, 0x7128, 0x6364, 0xC174, 0x5D34, 0x5866,
+	0x5975, 0x5354, 0xAF14, 0x506A, 0x5108, 0x5724, 0x5308, 0x4544,
+	0x4918, 0x4064, 0x49E2, 0x4179, 0x8163, 0x4054, 0xF81C, 0x394A,
+	0x38F3, 0x4172, 0x38F5, 0xA06F, 0x68EA, 0x69F2, 0x3134, 0x31A4,
+	0x305A, 0x68DA, 0x3056, 0x3146, 0x31F5, 0x3148, 0x5A61, 0x32C1,
+	0x31D2, 0x307E, 0x29E3, 0x30E6, 0x59C3, 0x2984, 0x29B6, 0x28F9,
+	0x5204, 0x28EE, 0x50CA, 0x2997, 0x48C6, 0x4838, 0x2953, 0x200C,
+	0x2943, 0x2173, 0x2D08, 0x4162, 0x29B4, 0x2314, 0x21B3, 0x212B,
+	0x210C, 0x48E3, 0x60DE, 0x205F, 0x20FE, 0x2028, 0x21A6, 0x404F,
+	0x20D6, 0x2214, 0x2127, 0x1873, 0x40CF, 0x206E, 0x1B09, 0x21C6,
+	0x2075, 0x19D5, 0x2305, 0x18D3, 0x2076, 0x1804, 0x230A, 0x304B,
+	0x20BB, 0x18B6, 0x1936, 0x1B19, 0x3037, 0x187F, 0x18A7, 0x1B85,
+	0x30BA, 0x183B, 0x1027, 0x18EF, 0x1B21, 0x1879, 0x10AB, 0x1917,
+	0x1114, 0x18BF, 0x1074, 0x1994, 0x2847, 0x111B, 0x28F2, 0x11E5,
+	0x19A7, 0x113A, 0x1046, 0x28B7, 0x207A, 0x182B, 0x1155, 0x104A,
+	0x1344, 0x293B, 0x11D3, 0x2014, 0x1044, 0x1018, 0x13A1, 0x1315,
+	0x2524, 0x20DF, 0x10E5, 0x1126, 0x12A2, 0x1824, 0x2271, 0x11F1,
+	0x2964, 0x12D1, 0x115A, 0x092A, 0x2341, 0x1A2D, 0x12E1, 0x090A,
+	0x13BF, 0x0A4D, 0x2119, 0x0BC1, 0x1233, 0x1A8A, 0x2008, 0x1159,
+	0x1A89, 0x08D5, 0x1156, 0x0834, 0x13EE, 0x1169, 0x1187, 0x1AA3,
+	0x1229, 0x1331, 0x0A85, 0x0937, 0x1704, 0x08FD, 0x2124, 0x0B13,
+	0x1251, 0x0AAD, 0x082C, 0x091A, 0x18D9, 0x0A99, 0x1848, 0x18E9,
+	0x0B95, 0x1144, 0x0AF1, 0x1A25, 0x131A, 0x09C5, 0x0986, 0x1BDF,
+	0x0B24, 0x0965, 0x1262, 0x0949, 0x0872, 0x09C2, 0x12C2, 0x0916,
+	0x085E, 0x0B06, 0x08CB, 0x08C7, 0x1242, 0x1BEF, 0x0A9A, 0x1152,
+	0x08B3, 0x0AA9, 0x090B, 0x08D2, 0x1B22, 0x0B04, 0x0865, 0x0A15,
+	0x1286, 0x0A83, 0x0A95, 0x09D1, 0x0A06, 0x0196, 0x1139, 0x0A3D,
+	0x0933, 0x13B1, 0x0123, 0x0D04, 0x08E2, 0x122E, 0x08A6, 0x00CE,
+	0x0A31, 0x1241, 0x0B51, 0x1057, 0x1171, 0x007D, 0x1145, 0x0A0A,
+	0x0129, 0x09FF, 0x089B, 0x085B, 0x0063, 0x0AB1, 0x0A1F, 0x0A5D,
+	0x0AA5, 0x0036, 0x0904, 0x0B86, 0x0A8B, 0x0897, 0x11E1, 0x0332,
+	0x083F, 0x0A19, 0x02B3, 0x0859, 0x08C3, 0x0855, 0x11B5, 0x01A5,
+	0x0AB2, 0x0392, 0x10DD, 0x09A3, 0x00ED, 0x0907, 0x1161, 0x002F,
+	0x0887, 0x0216, 0x0ABD, 0x0B81, 0x0A93, 0x0A21, 0x003A, 0x0ACD,
+	0x0AA1, 0x0A35, 0x0272, 0x0BDD, 0x03FE, 0x0BAF, 0x0869, 0x0213,
+	0x088B, 0x020B, 0x00B5, 0x1035, 0x08F1, 0x0151, 0x0A4E, 0x0239,
+	0x0BA2, 0x00AA, 0x0896, 0x0382, 0x0A08, 0x0A05, 0x0A09, 0x0142,
+	0x086D, 0x004E, 0x0B23, 0x0106, 0x0807, 0x036F, 0x0995, 0x03FD,
+	0x08AF, 0x08C5, 0x0062, 0x0053, 0x0B42, 0x0826, 0x021A, 0x01A2,
+	0x09B1, 0x00C9, 0x09B2, 0x0045, 0x0207, 0x08B9, 0x00A5, 0x0AD2,
+	0x0095, 0x003E, 0x0A32, 0x0383, 0x0849, 0x0135, 0x029E, 0x0A26,
+	0x023E, 0x0BFF, 0x0A52, 0x0311, 0x001B, 0x0915, 0x0A8D, 0x0223,
+	0x022A, 0x0BED, 0x0086, 0x0A96, 0x0222, 0x035F, 0x0A43, 0x085D,
+	0x0303, 0x0393, 0x0A63, 0x082A, 0x037F, 0x0932, 0x0043, 0x0292,
+	0x03CD, 0x0BDE, 0x009F, 0x0125, 0x08A9, 0x0253, 0x0015, 0x0192,
+	0x0A17, 0x08C2, 0x0316, 0x00D1, 0x0282, 0x0871, 0x0312, 0x0122,
+	0x0A9F, 0x02AE, 0x0006, 0x0A8E, 0x08E1, 0x0016, 0x0B0B, 0x00AE,
+	0x0025, 0x0193, 0x0AC3, 0x0017, 0x0307, 0x00BD, 0x08BE, 0x0039,
+	0x0BB2, 0x021B, 0x01FD, 0x084D, 0x03CE, 0x00A3, 0x0302, 0x0BCF,
+	0x0033, 0x0391, 0x028F, 0x0852, 0x0287, 0x008A, 0x0333, 0x080B,
+	0x0131, 0x01C1, 0x037E, 0x0A0F, 0x00B1, 0x002E, 0x0099, 0x0902,
+	0x009A, 0x003D, 0x0982, 0x0301, 0x00CD, 0x0941, 0x0042, 0x0183,
+	0x029D, 0x08A2, 0x021D, 0x001A, 0x0A97, 0x01EF, 0x01CE, 0x0051,
+	0x0BAE, 0x022F, 0x03BE, 0x021E, 0x000A, 0x09DF, 0x0029, 0x020D,
+	0x02BE, 0x029B, 0x09EE, 0x00B2, 0x0912, 0x036E, 0x009E, 0x0022,
+	0x0019, 0x0892, 0x0032, 0x01FE, 0x0083, 0x023F, 0x0B96, 0x000E,
+	0x008F, 0x0113, 0x0103, 0x001E, 0x0A0E, 0x0013, 0x008E, 0x0281,
+	0x09AF, 0x017E, 0x0203, 0x016F, 0x0291, 0x0023, 0x0093, 0x03BD,
+	0x001F, 0x01CF, 0x01DE, 0x0201, 0x01BF, 0x0B4F, 0x000F, 0x0202,
+	0x037D, 0x038E, 0x0211, 0x0212, 0x034E, 0x039F, 0x03AD, 0x015F,
+	0x017F, 0x032E, 0x033F, 0x034D, 0x035E, 0x036D, 0x032F, 0x033E,
+	0x035D, 0x038F, 0x039E
+};
+
+static const uint16_t percentile_arr_12x10_1[240] {
+	0x0621, 0xA443, 0xFCC2, 0x3CA3, 0x1D32, 0x14F1, 0x7462, 0x1433,
+	0x27CD, 0x2571, 0x57AE, 0x5DD1, 0x64B3, 0x44D2, 0x2C72, 0x25A2,
+	0x1E31, 0x55DF, 0x4C52, 0x1DEF, 0x0D51, 0x3C5D, 0x3C3E, 0x74DD,
+	0x347D, 0x27BE, 0x5CB5, 0x17DD, 0x2C14, 0x0CAE, 0x24B2, 0x15FF,
+	0x2701, 0x0D42, 0x1FBD, 0x0C35, 0x1603, 0x060D, 0x1D93, 0x0C96,
+	0x1C07, 0x1522, 0x0D06, 0x0F4F, 0x0C9F, 0x1F6E, 0x0D86, 0x0C2E,
+	0x1DFE, 0x0682, 0x1E91, 0x0F7D, 0x0C86, 0x040B, 0x1513, 0x044E,
+	0x14D1, 0x0C39, 0x14B9, 0x1C71, 0x05B1, 0x0C1F, 0x0681, 0x1445,
+	0x0C16, 0x0D95, 0x1583, 0x0D61, 0x0FAD, 0x1442, 0x048F, 0x0D0A,
+	0x049A, 0x0F6D, 0x146D, 0x0C2F, 0x0D25, 0x0406, 0x0C1A, 0x0D23,
+	0x0612, 0x0FAF, 0x0F11, 0x0592, 0x0515, 0x14E1, 0x0602, 0x048A,
+	0x0E1D, 0x0CBD, 0x0F9F, 0x0423, 0x075E, 0x174E, 0x0426, 0x0404,
+	0x0C22, 0x0CA2, 0x0DEE, 0x0CA5, 0x0F3F, 0x05C1, 0x0CCD, 0x0503,
+	0x044D, 0x0D16, 0x0449, 0x0D82, 0x0613, 0x0585, 0x0519, 0x0C95,
+	0x075F, 0x0D35, 0x04B1, 0x0509, 0x0531, 0x0DA1, 0x049E, 0x040A,
+	0x05CF, 0x0D41, 0x0415, 0x0692, 0x05FD, 0x0C25, 0x04A1, 0x0529,
+	0x0591, 0x0C93, 0x057F, 0x04C1, 0x0512, 0x051A, 0x078D, 0x0451,
+	0x0C0F, 0x0487, 0x0611, 0x0432, 0x042A, 0x05AF, 0x0461, 0x072D,
+	0x0409, 0x0405, 0x0D39, 0x05DE, 0x048E, 0x0499, 0x0483, 0x04A9,
+	0x0491, 0x042D, 0x049D, 0x0429, 0x040E, 0x05AE, 0x0521, 0x043D,
+	0x0581, 0x05DD, 0x0492, 0x0CAD, 0x041E, 0x058F, 0x071F, 0x072F,
+	0x0419, 0x073D, 0x057D, 0x0511, 0x05CE, 0x041D, 0x0485, 0x056E,
+	0x0412, 0x0431, 0x05BF, 0x0441, 0x054E, 0x0489, 0x0421, 0x0502,
+	0x0408, 0x040D, 0x051F, 0x059F, 0x073E, 0x078F, 0x0482, 0x079D,
+	0x0C02, 0x05BE, 0x048B, 0x0411, 0x0505, 0x057E, 0x052E, 0x074D,
+	0x077E, 0x054F, 0x0601, 0x055F, 0x068D, 0x070D, 0x070F, 0x071E,
+	0x072E, 0x05CD, 0x0403, 0x0501, 0x055D, 0x059E, 0x0781, 0x0413,
+	0x0417, 0x041B, 0x0453, 0x048D, 0x052F, 0x053E, 0x053F, 0x055E,
+	0x056F, 0x058E, 0x05BD, 0x05ED, 0x060E, 0x0622, 0x0683, 0x0702,
+	0x070E, 0x071D, 0x075D, 0x076F, 0x078E, 0x079E, 0x07BF, 0x07CE
+};
+
+static const packed_percentile_table block_pcd_12x10 =
+{
+	12, 10,
+	{ 491, 240 },
+	{ 1099, 341 },
+	{ 0, 23 },
+	{ percentile_arr_12x10_0, percentile_arr_12x10_1 }
+};
+#endif
+
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12)
+static const uint16_t percentile_arr_12x12_0[529] {
+	0x0334, 0xF534, 0x8514, 0x8954, 0x7F14, 0xFB54, 0x7B08, 0x7128,
+	0x7974, 0x6179, 0x6B64, 0x6908, 0x606A, 0x6724, 0xB544, 0xB066,
+	0xA14A, 0x5118, 0x9975, 0x51F9, 0x981C, 0x49CA, 0x4854, 0x886F,
+	0x88D4, 0x48EE, 0x41E2, 0x4163, 0x40F3, 0x4261, 0x4064, 0x407E,
+	0x385A, 0x42C1, 0x4172, 0x38EA, 0x3946, 0x78CF, 0xA056, 0x38DE,
+	0x3D08, 0x38F9, 0x3B14, 0x38FE, 0xA134, 0x38B8, 0x31A4, 0x71D2,
+	0x60DA, 0x39C3, 0x99BA, 0x60CA, 0x39F2, 0x30F5, 0x304F, 0x31B6,
+	0x31F5, 0x3204, 0x3148, 0x305F, 0x2953, 0x3194, 0x3184, 0x310C,
+	0x889C, 0x300C, 0x2943, 0x30EF, 0x28C6, 0x2997, 0x2838, 0x58E6,
+	0x20E4, 0x28E3, 0x2873, 0x29E3, 0x2A84, 0x28D3, 0x492B, 0x2962,
+	0x286E, 0x20BF, 0x21AA, 0x29A6, 0x6A14, 0x2828, 0x89C6, 0x21B3,
+	0x2305, 0x29B4, 0x2173, 0x2127, 0x20D6, 0x407F, 0x2294, 0x21D9,
+	0x21D5, 0x2004, 0x404B, 0x18DF, 0x2079, 0x219B, 0x18A8, 0x2385,
+	0x1936, 0x21AB, 0x188C, 0x1B09, 0x18BA, 0x203B, 0x187A, 0x1875,
+	0x2344, 0x18BB, 0x18B6, 0x193A, 0x1837, 0x1914, 0x1846, 0x1876,
+	0x1884, 0x1D24, 0x182B, 0x284A, 0x18A7, 0x18AB, 0x1917, 0x322D,
+	0x1047, 0x1874, 0x1818, 0x18F2, 0x1164, 0x1B89, 0x2959, 0x1B21,
+	0x39E5, 0x1827, 0x10F4, 0x18B7, 0x11D3, 0x1A4D, 0x1315, 0x12AD,
+	0x1AD1, 0x3A71, 0x1319, 0x11A7, 0x2044, 0x2F04, 0x2341, 0x10E5,
+	0x1155, 0x195A, 0x1024, 0x111B, 0x1251, 0x1233, 0x12E1, 0x13A1,
+	0x13BF, 0x212A, 0x22A2, 0x113B, 0x23DF, 0x10D5, 0x2399, 0x0814,
+	0x1126, 0x13EE, 0x1285, 0x10C4, 0x18FD, 0x20D9, 0x0987, 0x1242,
+	0x29C5, 0x2313, 0x0898, 0x13C1, 0x08C8, 0x11F1, 0x1034, 0x1B24,
+	0x0B0A, 0x11E9, 0x0808, 0x125D, 0x18E9, 0x0848, 0x1395, 0x0965,
+	0x123D, 0x2186, 0x1295, 0x18CE, 0x098B, 0x0BEF, 0x1504, 0x082C,
+	0x0A41, 0x1144, 0x0A89, 0x0956, 0x1331, 0x085E, 0x0B04, 0x128A,
+	0x12A3, 0x1937, 0x19C2, 0x0952, 0x0872, 0x08B4, 0x1262, 0x1124,
+	0x1969, 0x1063, 0x0AF1, 0x1225, 0x0894, 0x11C9, 0x18D2, 0x0ACD,
+	0x0A29, 0x0B06, 0x09B5, 0x18C7, 0x0916, 0x1088, 0x09FF, 0x2206,
+	0x0A15, 0x08B3, 0x0B51, 0x0A1F, 0x18CB, 0x0AC2, 0x0A2E, 0x1865,
+	0x08AC, 0x0A31, 0x08A4, 0x138A, 0x0A99, 0x09D1, 0x0A86, 0x189B,
+	0x0283, 0x0BDD, 0x0ABD, 0x1933, 0x083F, 0x1386, 0x0923, 0x0322,
+	0x0869, 0x10DD, 0x13B1, 0x082F, 0x087D, 0x11B9, 0x085B, 0x08ED,
+	0x00C3, 0x08E2, 0x084E, 0x0887, 0x0855, 0x0A0A, 0x0857, 0x0B92,
+	0x1036, 0x12A5, 0x0293, 0x0945, 0x08A6, 0x0196, 0x19A3, 0x036F,
+	0x0904, 0x1205, 0x09E1, 0x0381, 0x0971, 0x1219, 0x0BAF, 0x0949,
+	0x00AF, 0x0AA9, 0x018A, 0x0907, 0x0BFD, 0x003A, 0x0BCD, 0x0AB2,
+	0x088B, 0x0252, 0x0A4E, 0x03FF, 0x0845, 0x0897, 0x0059, 0x090B,
+	0x0B42, 0x0807, 0x0A16, 0x0853, 0x0A8D, 0x01B2, 0x0AB1, 0x091A,
+	0x0195, 0x0A35, 0x00B5, 0x10AA, 0x0115, 0x0A21, 0x0096, 0x0A08,
+	0x03FE, 0x0B7F, 0x08B9, 0x12B3, 0x023E, 0x0A23, 0x029E, 0x08F1,
+	0x01A9, 0x0BDE, 0x0843, 0x02D2, 0x0A1A, 0x08C5, 0x0151, 0x0A43,
+	0x0332, 0x0383, 0x0826, 0x0BED, 0x10C2, 0x00AE, 0x0B82, 0x0213,
+	0x0232, 0x085D, 0x02A1, 0x101B, 0x035F, 0x0303, 0x0A39, 0x0207,
+	0x0A53, 0x0142, 0x01A5, 0x082A, 0x0099, 0x0A17, 0x03CF, 0x0906,
+	0x0125, 0x0A96, 0x0A9A, 0x0209, 0x0393, 0x0961, 0x0131, 0x0A88,
+	0x0139, 0x099A, 0x0292, 0x0272, 0x0862, 0x08BE, 0x0141, 0x02C3,
+	0x0886, 0x0039, 0x08A9, 0x01A2, 0x01B1, 0x0851, 0x020B, 0x086D,
+	0x0312, 0x08CD, 0x020F, 0x0311, 0x0BCE, 0x0135, 0x0006, 0x0849,
+	0x0132, 0x0A8F, 0x022F, 0x022A, 0x0AAE, 0x0A8E, 0x0263, 0x03A2,
+	0x083E, 0x009A, 0x021B, 0x0835, 0x0323, 0x0871, 0x0993, 0x0226,
+	0x0302, 0x0922, 0x0119, 0x0222, 0x021D, 0x0B07, 0x08C9, 0x037E,
+	0x08BD, 0x0042, 0x00D1, 0x0B33, 0x01C1, 0x0B9A, 0x0282, 0x088A,
+	0x0182, 0x083D, 0x004D, 0x010A, 0x0A1E, 0x0019, 0x00B2, 0x0999,
+	0x00A5, 0x0095, 0x0817, 0x0022, 0x031A, 0x0902, 0x00A3, 0x01BF,
+	0x029F, 0x0816, 0x03B2, 0x0015, 0x0391, 0x0BBE, 0x01FE, 0x1129,
+	0x002E, 0x01DF, 0x0301, 0x0033, 0x0B6E, 0x00E1, 0x0297, 0x00B1,
+	0x009F, 0x0B16, 0x000A, 0x001A, 0x0052, 0x080B, 0x030B, 0x029D,
+	0x0BAE, 0x01FD, 0x020E, 0x00A2, 0x0A3F, 0x0192, 0x0ABE, 0x020D,
+	0x008F, 0x028B, 0x0083, 0x0025, 0x09EE, 0x01EF, 0x0029, 0x0291,
+	0x0B4F, 0x0396, 0x0287, 0x008E, 0x0092, 0x0B4E, 0x017E, 0x001E,
+	0x009E, 0x0103, 0x080F, 0x000E, 0x0113, 0x0203, 0x01CF, 0x0183,
+	0x01CE, 0x001F, 0x0112, 0x01DE, 0x038E, 0x0832, 0x033E, 0x0212,
+	0x029B, 0x0023, 0x016F, 0x0201, 0x09AF, 0x0202, 0x0281, 0x035E,
+	0x034D, 0x037D, 0x03AD, 0x0013, 0x0093, 0x015F, 0x0211, 0x033F,
+	0x036D, 0x039F, 0x03BD, 0x017F, 0x032E, 0x032F, 0x035D, 0x038F,
+	0x039E
+};
+
+// Packed percentile data for the 12x12 block size (second plane count).
+// Each entry packs an 11-bit block mode index (low bits) and a 5-bit
+// percentile delta (high bits); see get_2d_percentile_table() for the
+// unpacking and accumulation scheme.
+static const uint16_t percentile_arr_12x12_1[246] {
+	0x0443, 0xFFCD, 0x2C62, 0x2E21, 0x3CF1, 0x34C2, 0x4CDD, 0x2452,
+	0xD5DF, 0x1DD1, 0x0FAE, 0x64A3, 0x0C7D, 0x3433, 0x1CD2, 0x2DEF,
+	0x0C3E, 0x1D71, 0xA472, 0x0D32, 0x54B3, 0x4D51, 0x445D, 0x0E31,
+	0x1FDD, 0x0DFF, 0x0CAE, 0x45A2, 0x2FBE, 0xA4B9, 0x1C4E, 0x2C9F,
+	0x160D, 0x0D42, 0x342E, 0x074F, 0x1414, 0x0F6E, 0x0CB2, 0x34B5,
+	0x0DFE, 0x0D86, 0x1496, 0x1D22, 0x0691, 0x140B, 0x041F, 0x0C35,
+	0x1D93, 0x1506, 0x1439, 0x0C9A, 0x0F01, 0x2442, 0x0C8F, 0x04D1,
+	0x1486, 0x0C6D, 0x0513, 0x0C71, 0x0E82, 0x177D, 0x0E03, 0x07BD,
+	0x0C2F, 0x0D83, 0x07AF, 0x0D61, 0x1407, 0x0DB1, 0x050A, 0x0C94,
+	0x07AD, 0x0D8A, 0x0C04, 0x0416, 0x0C49, 0x0445, 0x15C1, 0x0C1A,
+	0x0525, 0x0595, 0x0C8A, 0x075E, 0x0CBD, 0x0681, 0x0F4E, 0x075F,
+	0x061D, 0x1541, 0x0CB1, 0x0F3F, 0x0406, 0x076D, 0x0DCF, 0x05EE,
+	0x0D23, 0x0599, 0x0CCD, 0x0711, 0x0C23, 0x079F, 0x0D15, 0x0585,
+	0x04A2, 0x042A, 0x0D31, 0x05BF, 0x0D92, 0x0C26, 0x043D, 0x0C93,
+	0x0502, 0x0C15, 0x048B, 0x0D03, 0x0613, 0x0516, 0x0495, 0x0C29,
+	0x04A5, 0x040F, 0x0425, 0x0539, 0x0D19, 0x04E1, 0x05BE, 0x0422,
+	0x0432, 0x0C0A, 0x0431, 0x041E, 0x0492, 0x04A9, 0x0582, 0x0529,
+	0x0487, 0x0C4D, 0x0512, 0x049E, 0x0505, 0x0451, 0x0D7F, 0x0489,
+	0x0602, 0x05DE, 0x0591, 0x0535, 0x074D, 0x055E, 0x04C1, 0x0612,
+	0x05DD, 0x05FD, 0x0C61, 0x0521, 0x0484, 0x05CE, 0x0581, 0x0491,
+	0x051A, 0x04A1, 0x048E, 0x040D, 0x0499, 0x071F, 0x072E, 0x075D,
+	0x0441, 0x0589, 0x057E, 0x0CAD, 0x0501, 0x054F, 0x0692, 0x0511,
+	0x049D, 0x0509, 0x056E, 0x040E, 0x0409, 0x0601, 0x048D, 0x0413,
+	0x053E, 0x0419, 0x072D, 0x0408, 0x0485, 0x042D, 0x041D, 0x05A1,
+	0x0781, 0x0402, 0x05ED, 0x0C82, 0x0403, 0x057D, 0x05CD, 0x0611,
+	0x0488, 0x0411, 0x054E, 0x051F, 0x053F, 0x056F, 0x059F, 0x070F,
+	0x071D, 0x073D, 0x073E, 0x077E, 0x078F, 0x0405, 0x079D, 0x079E,
+	0x058E, 0x0412, 0x055D, 0x05AE, 0x041B, 0x0421, 0x0453, 0x0417,
+	0x0483, 0x052E, 0x052F, 0x055F, 0x058F, 0x059E, 0x05AF, 0x05BD,
+	0x060E, 0x0622, 0x0683, 0x068D, 0x0702, 0x070D, 0x070E, 0x071E,
+	0x072F, 0x076F, 0x078D, 0x078E, 0x07BF, 0x07CE
+};
+
+// Packed percentile table for the 12x12 block size. Based on how
+// get_2d_percentile_table() reads it, the initializers are: block x/y
+// dimensions, per-plane packed item counts, per-plane percentile scale
+// divisors, per-plane initial percentile accumulators, and the two packed
+// item arrays -- TODO confirm against the packed_percentile_table declaration.
+static const packed_percentile_table block_pcd_12x12 {
+	12, 12,
+	{ 529, 246 },
+	{ 1435, 335 },
+	{ 0, 22 },
+	{ percentile_arr_12x12_0, percentile_arr_12x12_1 }
+};
+#endif
+
+/**
+ * @brief Fetch the packed percentile table for the given 2D block size.
+ *
+ * @param xdim The block x size.
+ * @param ydim The block y size.
+ *
+ * @return The packed table.
+ */
+static const packed_percentile_table *get_packed_table(
+	int xdim,
+	int ydim
+) {
+	// Pack both dimensions into a single switchable key; y in the high byte
+	int idx = (ydim << 8) | xdim;
+	switch (idx)
+	{
+	// Tables for footprints larger than the configured texel budget are
+	// compiled out entirely to save binary size
+#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4)
+		case 0x0404: return &block_pcd_4x4;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4)
+		case 0x0405: return &block_pcd_5x4;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5)
+		case 0x0505: return &block_pcd_5x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5)
+		case 0x0506: return &block_pcd_6x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6)
+		case 0x0606: return &block_pcd_6x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5)
+		case 0x0508: return &block_pcd_8x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6)
+		case 0x0608: return &block_pcd_8x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8)
+		case 0x0808: return &block_pcd_8x8;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5)
+		case 0x050A: return &block_pcd_10x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6)
+		case 0x060A: return &block_pcd_10x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8)
+		case 0x080A: return &block_pcd_10x8;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10)
+		case 0x0A0A: return &block_pcd_10x10;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10)
+		case 0x0A0C: return &block_pcd_12x10;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12)
+		case 0x0C0C: return &block_pcd_12x12;
+#endif
+	}
+
+	// Should never hit this with a valid 2D block size
+	return nullptr;
+}
+
+/* See header for documentation. */
+const float *get_2d_percentile_table(
+	unsigned int xdim,
+	unsigned int ydim
+) {
+	// NOTE: the caller owns the returned buffer and must delete[] it
+	float* unpacked_table = new float[WEIGHTS_MAX_BLOCK_MODES];
+	// Assumes (xdim, ydim) is a legal 2D block size within the compiled
+	// texel budget; get_packed_table() returns nullptr otherwise and apt
+	// would be dereferenced below -- callers must validate first
+	const packed_percentile_table *apt = get_packed_table(xdim, ydim);
+
+	// Set the default percentile
+	for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
+	{
+		unpacked_table[i] = 1.0f;
+	}
+
+	// Populate the unpacked percentile values
+	for (int i = 0; i < 2; i++)
+	{
+		unsigned int itemcount = apt->item_count[i];
+		unsigned int difscale = apt->difscales[i];
+		unsigned int accum = apt->initial_percs[i];
+		const uint16_t *item_ptr = apt->items[i];
+
+		for (unsigned int j = 0; j < itemcount; j++)
+		{
+			// Each item packs an 11-bit block mode index and a 5-bit delta;
+			// deltas are accumulated and rescaled to get the percentile
+			uint16_t item = item_ptr[j];
+			unsigned int idx = item & 0x7FF;
+			unsigned int weight = (item >> 11) & 0x1F;
+			accum += weight;
+			unpacked_table[idx] = static_cast<float>(accum) / static_cast<float>(difscale);
+		}
+	}
+
+	return unpacked_table;
+}
+#endif
+
+/* See header for documentation. */
+bool is_legal_2d_block_size(
+	unsigned int xdim,
+	unsigned int ydim
+) {
+	// Every 2D footprint permitted by the ASTC specification, with the
+	// x dimension packed into the high byte and y into the low byte
+	static const uint16_t valid_footprints[] {
+		0x0404, 0x0504, 0x0505, 0x0605, 0x0606, 0x0805, 0x0806,
+		0x0808, 0x0A05, 0x0A06, 0x0A08, 0x0A0A, 0x0C0A, 0x0C0C
+	};
+
+	// Pack the probe the same way as the table entries
+	unsigned int probe = (xdim << 8) | ydim;
+	for (unsigned int footprint : valid_footprints)
+	{
+		if (footprint == probe)
+		{
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* See header for documentation. */
+bool is_legal_3d_block_size(
+	unsigned int xdim,
+	unsigned int ydim,
+	unsigned int zdim
+) {
+	// Every 3D footprint permitted by the ASTC specification, with one
+	// byte per dimension (x highest, z lowest)
+	static const uint32_t valid_footprints[] {
+		0x030303, 0x040303, 0x040403, 0x040404, 0x050404,
+		0x050504, 0x050505, 0x060505, 0x060605, 0x060606
+	};
+
+	// Pack the probe the same way as the table entries
+	unsigned int probe = (xdim << 16) | (ydim << 8) | zdim;
+	for (unsigned int footprint : valid_footprints)
+	{
+		if (footprint == probe)
+		{
+			return true;
+		}
+	}
+
+	return false;
+}

+ 1350 - 0
thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp

@@ -0,0 +1,1350 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for finding best endpoint format.
+ *
+ * We assume there are two independent sources of error in any given partition:
+ *
+ *   - Encoding choice errors
+ *   - Quantization errors
+ *
+ * Encoding choice errors are caused by encoder decisions. For example:
+ *
+ *   - Using luminance instead of separate RGB components.
+ *   - Using a constant 1.0 alpha instead of storing an alpha component.
+ *   - Using RGB+scale instead of storing two full RGB endpoints.
+ *
+ * Quantization errors occur due to the limited precision we use for storage. These errors generally
+ * scale with quantization level, but are not actually independent of color encoding. In particular:
+ *
+ *   - If we can use offset encoding then quantization error is halved.
+ *   - If we can use blue-contraction then quantization error for RG is halved.
+ *   - If we use HDR endpoints the quantization error is higher.
+ *
+ * Apart from these effects, we assume the error is proportional to the quantization step size.
+ */
+
+
+#include "astcenc_internal.h"
+#include "astcenc_vecmathlib.h"
+
+#include <assert.h>
+
+/**
+ * @brief Compute the errors of the endpoint line options for one partition.
+ *
+ * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same
+ * chroma data assumes storing RGBA endpoints which pass though the origin (LDR only). RGBL data
+ * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a
+ * single value.
+ *
+ *
+ * @param      pi                The partition info data.
+ * @param      partition_index   The partition index to compute the error for.
+ * @param      blk               The image block.
+ * @param      uncor_pline       The endpoint line assuming uncorrelated endpoints.
+ * @param[out] uncor_err         The computed error for the uncorrelated endpoint line.
+ * @param      samec_pline       The endpoint line assuming the same chroma for both endpoints.
+ * @param[out] samec_err         The computed error for the same chroma endpoint line.
+ * @param      rgbl_pline        The endpoint line assuming RGB + lumashift data.
+ * @param[out] rgbl_err          The computed error for the RGB + lumashift endpoint line.
+ * @param      l_pline           The endpoint line assuming luminance data.
+ * @param[out] l_err             The computed error for the luminance endpoint line.
+ * @param[out] a_drop_err        The computed error for dropping the alpha component.
+ */
+static void compute_error_squared_rgb_single_partition(
+	const partition_info& pi,
+	int partition_index,
+	const image_block& blk,
+	const processed_line3& uncor_pline,
+	float& uncor_err,
+	const processed_line3& samec_pline,
+	float& samec_err,
+	const processed_line3& rgbl_pline,
+	float& rgbl_err,
+	const processed_line3& l_pline,
+	float& l_err,
+	float& a_drop_err
+) {
+	// Per-channel error weights for this block
+	vfloat4 ews = blk.channel_weight;
+
+	unsigned int texel_count = pi.partition_texel_count[partition_index];
+	const uint8_t* texel_indexes = pi.texels_of_partition[partition_index];
+	promise(texel_count > 0);
+
+	vfloatacc a_drop_errv = vfloatacc::zero();
+	vfloat default_a(blk.get_default_alpha());
+
+	// Splat each candidate line's direction and offset components into
+	// per-lane vectors so the inner loop is branch-free vector math
+	vfloatacc uncor_errv = vfloatacc::zero();
+	vfloat uncor_bs0(uncor_pline.bs.lane<0>());
+	vfloat uncor_bs1(uncor_pline.bs.lane<1>());
+	vfloat uncor_bs2(uncor_pline.bs.lane<2>());
+
+	vfloat uncor_amod0(uncor_pline.amod.lane<0>());
+	vfloat uncor_amod1(uncor_pline.amod.lane<1>());
+	vfloat uncor_amod2(uncor_pline.amod.lane<2>());
+
+	vfloatacc samec_errv = vfloatacc::zero();
+	vfloat samec_bs0(samec_pline.bs.lane<0>());
+	vfloat samec_bs1(samec_pline.bs.lane<1>());
+	vfloat samec_bs2(samec_pline.bs.lane<2>());
+
+	vfloatacc rgbl_errv = vfloatacc::zero();
+	vfloat rgbl_bs0(rgbl_pline.bs.lane<0>());
+	vfloat rgbl_bs1(rgbl_pline.bs.lane<1>());
+	vfloat rgbl_bs2(rgbl_pline.bs.lane<2>());
+
+	vfloat rgbl_amod0(rgbl_pline.amod.lane<0>());
+	vfloat rgbl_amod1(rgbl_pline.amod.lane<1>());
+	vfloat rgbl_amod2(rgbl_pline.amod.lane<2>());
+
+	vfloatacc l_errv = vfloatacc::zero();
+	vfloat l_bs0(l_pline.bs.lane<0>());
+	vfloat l_bs1(l_pline.bs.lane<1>());
+	vfloat l_bs2(l_pline.bs.lane<2>());
+
+	// Process the partition's texels in vector-width chunks
+	vint lane_ids = vint::lane_id();
+	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vint tix(texel_indexes + i);
+
+		// Disable lanes past the end of the partition in the final chunk
+		vmask mask = lane_ids < vint(texel_count);
+		lane_ids += vint(ASTCENC_SIMD_WIDTH);
+
+		// Compute the error that arises from just ditching alpha
+		vfloat data_a = gatherf(blk.data_a, tix);
+		vfloat alpha_diff = data_a - default_a;
+		alpha_diff = alpha_diff * alpha_diff;
+
+		haccumulate(a_drop_errv, alpha_diff, mask);
+
+		vfloat data_r = gatherf(blk.data_r, tix);
+		vfloat data_g = gatherf(blk.data_g, tix);
+		vfloat data_b = gatherf(blk.data_b, tix);
+
+		// Compute uncorrelated error
+		// param is the texel's projection onto the line direction
+		vfloat param = data_r * uncor_bs0
+		             + data_g * uncor_bs1
+		             + data_b * uncor_bs2;
+
+		vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r;
+		vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g;
+		vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b;
+
+		vfloat error = dist0 * dist0 * ews.lane<0>()
+		             + dist1 * dist1 * ews.lane<1>()
+		             + dist2 * dist2 * ews.lane<2>();
+
+		haccumulate(uncor_errv, error, mask);
+
+		// Compute same chroma error - no "amod", it's always zero
+		param = data_r * samec_bs0
+		      + data_g * samec_bs1
+		      + data_b * samec_bs2;
+
+		dist0 = (param * samec_bs0) - data_r;
+		dist1 = (param * samec_bs1) - data_g;
+		dist2 = (param * samec_bs2) - data_b;
+
+		error = dist0 * dist0 * ews.lane<0>()
+		      + dist1 * dist1 * ews.lane<1>()
+		      + dist2 * dist2 * ews.lane<2>();
+
+		haccumulate(samec_errv, error, mask);
+
+		// Compute rgbl error
+		param = data_r * rgbl_bs0
+		      + data_g * rgbl_bs1
+		      + data_b * rgbl_bs2;
+
+		dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r;
+		dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g;
+		dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b;
+
+		error = dist0 * dist0 * ews.lane<0>()
+		      + dist1 * dist1 * ews.lane<1>()
+		      + dist2 * dist2 * ews.lane<2>();
+
+		haccumulate(rgbl_errv, error, mask);
+
+		// Compute luma error - no "amod", it's always zero
+		param = data_r * l_bs0
+		      + data_g * l_bs1
+		      + data_b * l_bs2;
+
+		dist0 = (param * l_bs0) - data_r;
+		dist1 = (param * l_bs1) - data_g;
+		dist2 = (param * l_bs2) - data_b;
+
+		error = dist0 * dist0 * ews.lane<0>()
+		      + dist1 * dist1 * ews.lane<1>()
+		      + dist2 * dist2 * ews.lane<2>();
+
+		haccumulate(l_errv, error, mask);
+	}
+
+	// Horizontal reduction of the per-lane accumulators; the alpha-drop
+	// error is scaled by the alpha channel weight
+	a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>();
+	uncor_err = hadd_s(uncor_errv);
+	samec_err = hadd_s(samec_errv);
+	rgbl_err = hadd_s(rgbl_errv);
+	l_err = hadd_s(l_errv);
+}
+
+/**
+ * @brief For a given set of input colors and partitioning determine endpoint encode errors.
+ *
+ * This function determines the color error that results from RGB-scale encoding (LDR only),
+ * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether
+ * the endpoints are eligible for offset encoding or blue-contraction
+ *
+ * @param      blk   The image block.
+ * @param      pi    The partition info data.
+ * @param      ep    The idealized endpoints.
+ * @param[out] eci   The resulting encoding choice error metrics.
+ */
+static void compute_encoding_choice_errors(
+	const image_block& blk,
+	const partition_info& pi,
+	const endpoints& ep,
+	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])
+{
+	int partition_count = pi.partition_count;
+	promise(partition_count > 0);
+
+	partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+	// Compute the average color and dominant direction for each partition
+	compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		partition_metrics& pm = pms[i];
+
+		line3 uncor_rgb_lines;
+		line3 samec_rgb_lines;  // for LDR-RGB-scale
+		line3 rgb_luma_lines;   // for HDR-RGB-scale
+
+		processed_line3 uncor_rgb_plines;
+		processed_line3 samec_rgb_plines;
+		processed_line3 rgb_luma_plines;
+		processed_line3 luminance_plines;
+
+		float uncorr_rgb_error;
+		float samechroma_rgb_error;
+		float rgb_luma_error;
+		float luminance_rgb_error;
+		float alpha_drop_error;
+
+		// Uncorrelated: line through the mean along the dominant direction
+		uncor_rgb_lines.a = pm.avg;
+		uncor_rgb_lines.b = normalize_safe(pm.dir, unit3());
+
+		// Same chroma: line through the origin towards the mean
+		samec_rgb_lines.a = vfloat4::zero();
+		samec_rgb_lines.b = normalize_safe(pm.avg, unit3());
+
+		// RGB luma: line through the mean along the grey axis
+		rgb_luma_lines.a = pm.avg;
+		rgb_luma_lines.b = unit3();
+
+		uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b);
+		uncor_rgb_plines.bs   = uncor_rgb_lines.b;
+
+		// Same chroma always goes though zero, so this is simpler than the others
+		samec_rgb_plines.amod = vfloat4::zero();
+		samec_rgb_plines.bs   = samec_rgb_lines.b;
+
+		rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b);
+		rgb_luma_plines.bs   = rgb_luma_lines.b;
+
+		// Luminance always goes though zero, so this is simpler than the others
+		luminance_plines.amod = vfloat4::zero();
+		luminance_plines.bs   = unit3();
+
+		compute_error_squared_rgb_single_partition(
+		    pi, i, blk,
+		    uncor_rgb_plines, uncorr_rgb_error,
+		    samec_rgb_plines, samechroma_rgb_error,
+		    rgb_luma_plines,  rgb_luma_error,
+		    luminance_plines, luminance_rgb_error,
+		                      alpha_drop_error);
+
+		// Determine if we can offset encode RGB lanes; requires all three
+		// RGB endpoint deltas to be small (under 12% of the value range)
+		vfloat4 endpt0 = ep.endpt0[i];
+		vfloat4 endpt1 = ep.endpt1[i];
+		vfloat4 endpt_diff = abs(endpt1 - endpt0);
+		vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f);
+		bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7;
+
+		// Store out the settings; scale factors are tuning constants
+		eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f;  // empirical
+		eci[i].rgb_luma_error  = (rgb_luma_error - uncorr_rgb_error) * 1.5f;        // wild guess
+		eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f;   // empirical
+		eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
+		eci[i].can_offset_encode = can_offset_encode;
+		eci[i].can_blue_contract = !blk.is_luminance();
+	}
+}
+
+/**
+ * @brief For a given partition compute the error for every endpoint integer count and quant level.
+ *
+ * @param      encode_hdr_rgb     @c true if using HDR for RGB, @c false for LDR.
+ * @param      encode_hdr_alpha   @c true if using HDR for alpha, @c false for LDR.
+ * @param      partition_index    The partition index.
+ * @param      pi                 The partition info.
+ * @param      eci                The encoding choice error metrics.
+ * @param      ep                 The idealized endpoints.
+ * @param      error_weight       The per-channel error weights.
+ * @param[out] best_error         The best error for each integer count and quant level.
+ * @param[out] format_of_choice   The preferred endpoint format for each integer count and quant level.
+ */
+static void compute_color_error_for_every_integer_count_and_quant_level(
+	bool encode_hdr_rgb,
+	bool encode_hdr_alpha,
+	int partition_index,
+	const partition_info& pi,
+	const encoding_choice_errors& eci,
+	const endpoints& ep,
+	vfloat4 error_weight,
+	float best_error[21][4],
+	uint8_t format_of_choice[21][4]
+) {
+	int partition_size = pi.partition_texel_count[partition_index];
+
+	// Baseline squared quantization error for each quant level from QUANT_6
+	// upward; entries scale with the inverse square of the step count --
+	// presumably derived from uniform quantization error over the 16-bit
+	// value range; TODO confirm the 1/18 factor
+	static const float baseline_quant_error[21 - QUANT_6] {
+		(65536.0f * 65536.0f / 18.0f) / (5 * 5),
+		(65536.0f * 65536.0f / 18.0f) / (7 * 7),
+		(65536.0f * 65536.0f / 18.0f) / (9 * 9),
+		(65536.0f * 65536.0f / 18.0f) / (11 * 11),
+		(65536.0f * 65536.0f / 18.0f) / (15 * 15),
+		(65536.0f * 65536.0f / 18.0f) / (19 * 19),
+		(65536.0f * 65536.0f / 18.0f) / (23 * 23),
+		(65536.0f * 65536.0f / 18.0f) / (31 * 31),
+		(65536.0f * 65536.0f / 18.0f) / (39 * 39),
+		(65536.0f * 65536.0f / 18.0f) / (47 * 47),
+		(65536.0f * 65536.0f / 18.0f) / (63 * 63),
+		(65536.0f * 65536.0f / 18.0f) / (79 * 79),
+		(65536.0f * 65536.0f / 18.0f) / (95 * 95),
+		(65536.0f * 65536.0f / 18.0f) / (127 * 127),
+		(65536.0f * 65536.0f / 18.0f) / (159 * 159),
+		(65536.0f * 65536.0f / 18.0f) / (191 * 191),
+		(65536.0f * 65536.0f / 18.0f) / (255 * 255)
+	};
+
+	vfloat4 ep0 = ep.endpt0[partition_index];
+	vfloat4 ep1 = ep.endpt1[partition_index];
+
+	float ep1_min = hmin_rgb_s(ep1);
+	ep1_min = astc::max(ep1_min, 0.0f);
+
+	float error_weight_rgbsum = hadd_rgb_s(error_weight);
+
+	float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f;
+	float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f;
+
+	// It is possible to get endpoint colors significantly outside [0,upper-limit] even if the
+	// input data are safely contained in [0,upper-limit]; we need to add an error term for this
+	vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha);
+	vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f);
+	vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f);
+
+	vfloat4 ep0_range_error_low = min(ep0, 0.0f);
+	vfloat4 ep1_range_error_low = min(ep1, 0.0f);
+
+	vfloat4 sum_range_error =
+		(ep0_range_error_low * ep0_range_error_low) +
+		(ep1_range_error_low * ep1_range_error_low) +
+		(ep0_range_error_high * ep0_range_error_high) +
+		(ep1_range_error_high * ep1_range_error_high);
+
+	float rgb_range_error = dot3_s(sum_range_error, error_weight)
+	                      * 0.5f * static_cast<float>(partition_size);
+	float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>()
+	                        * 0.5f * static_cast<float>(partition_size);
+
+	if (encode_hdr_rgb)
+	{
+
+		// Collect some statistics
+		float af, cf;
+		if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>())
+		{
+			af = ep1.lane<0>();
+			cf = ep1.lane<0>() - ep0.lane<0>();
+		}
+		else if (ep1.lane<1>() > ep1.lane<2>())
+		{
+			af = ep1.lane<1>();
+			cf = ep1.lane<1>() - ep0.lane<1>();
+		}
+		else
+		{
+			af = ep1.lane<2>();
+			cf = ep1.lane<2>() - ep0.lane<2>();
+		}
+
+		// Estimate of color-component spread in high endpoint color
+		float bf = af - ep1_min;
+		vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>();
+		vfloat4 pdif = prd - ep0.swz<0, 1, 2>();
+		// Estimate of color-component spread in low endpoint color
+		float df = hmax_s(abs(pdif));
+
+		int b = static_cast<int>(bf);
+		int c = static_cast<int>(cf);
+		int d = static_cast<int>(df);
+
+		// Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode
+		// Later checks overwrite earlier ones, so the last matching (most
+		// precise) submode wins
+		int rgbo_mode = 5;		// 7 bits per component
+		// mode 4: 8 7 6
+		if (b < 32768 && c < 16384)
+		{
+			rgbo_mode = 4;
+		}
+
+		// mode 3: 9 6 7
+		if (b < 8192 && c < 16384)
+		{
+			rgbo_mode = 3;
+		}
+
+		// mode 2: 10 5 8
+		if (b < 2048 && c < 16384)
+		{
+			rgbo_mode = 2;
+		}
+
+		// mode 1: 11 6 5
+		if (b < 2048 && c < 1024)
+		{
+			rgbo_mode = 1;
+		}
+
+		// mode 0: 11 5 7
+		if (b < 1024 && c < 4096)
+		{
+			rgbo_mode = 0;
+		}
+
+		// Determine which one of the 9 submodes is likely to be used in case of an RGB-mode.
+		int rgb_mode = 8;		// 8 bits per component, except 7 bits for blue
+
+		// mode 0: 9 7 6 7
+		if (b < 16384 && c < 8192 && d < 8192)
+		{
+			rgb_mode = 0;
+		}
+
+		// mode 1: 9 8 6 6
+		if (b < 32768 && c < 8192 && d < 4096)
+		{
+			rgb_mode = 1;
+		}
+
+		// mode 2: 10 6 7 7
+		if (b < 4096 && c < 8192 && d < 4096)
+		{
+			rgb_mode = 2;
+		}
+
+		// mode 3: 10 7 7 6
+		if (b < 8192 && c < 8192 && d < 2048)
+		{
+			rgb_mode = 3;
+		}
+
+		// mode 4: 11 8 6 5
+		if (b < 8192 && c < 2048 && d < 512)
+		{
+			rgb_mode = 4;
+		}
+
+		// mode 5: 11 6 8 6
+		if (b < 2048 && c < 8192 && d < 1024)
+		{
+			rgb_mode = 5;
+		}
+
+		// mode 6: 12 7 7 5
+		if (b < 2048 && c < 2048 && d < 256)
+		{
+			rgb_mode = 6;
+		}
+
+		// mode 7: 12 6 7 6
+		if (b < 1024 && c < 2048 && d < 512)
+		{
+			rgb_mode = 7;
+		}
+
+		static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f };
+		static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f };
+
+		float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f;  // Empirically determined ....
+		float mode11mult = rgb_error_scales[rgb_mode] * 0.010f;    // Empirically determined ....
+
+
+		float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f);
+		float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f);
+		float lumdif = lum_high - lum_low;
+		float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f;
+
+		mode23mult *= 0.0005f;  // Empirically determined ....
+
+		// Pick among the available HDR endpoint modes
+		// Quant levels below QUANT_16 are marked unusable for HDR
+		for (int i = QUANT_2; i < QUANT_16; i++)
+		{
+			best_error[i][3] = ERROR_CALC_DEFAULT;
+			best_error[i][2] = ERROR_CALC_DEFAULT;
+			best_error[i][1] = ERROR_CALC_DEFAULT;
+			best_error[i][0] = ERROR_CALC_DEFAULT;
+
+			format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
+			format_of_choice[i][2] = FMT_HDR_RGB;
+			format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
+			format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
+		}
+
+		for (int i = QUANT_16; i <= QUANT_256; i++)
+		{
+			// The base_quant_error should depend on the scale-factor that would be used during
+			// actual encode of the color value
+
+			float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
+			float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
+			float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
+			float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
+
+			// For 8 integers, we have two encodings: one with HDR A and another one with LDR A
+
+			float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
+			best_error[i][3] = full_hdr_rgba_error;
+			format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
+
+			// For 6 integers, we have one HDR-RGB encoding
+			float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
+			best_error[i][2] = full_hdr_rgb_error;
+			format_of_choice[i][2] = FMT_HDR_RGB;
+
+			// For 4 integers, we have one HDR-RGB-Scale encoding
+			float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error;
+
+			best_error[i][1] = hdr_rgb_scale_error;
+			format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
+
+			// For 2 integers, we assume luminance-with-large-range
+			float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error;
+			best_error[i][0] = hdr_luminance_error;
+			format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
+		}
+	}
+	else
+	{
+		// Quant levels below QUANT_6 are marked unusable for LDR
+		for (int i = QUANT_2; i < QUANT_6; i++)
+		{
+			best_error[i][3] = ERROR_CALC_DEFAULT;
+			best_error[i][2] = ERROR_CALC_DEFAULT;
+			best_error[i][1] = ERROR_CALC_DEFAULT;
+			best_error[i][0] = ERROR_CALC_DEFAULT;
+
+			format_of_choice[i][3] = FMT_RGBA;
+			format_of_choice[i][2] = FMT_RGB;
+			format_of_choice[i][1] = FMT_RGB_SCALE;
+			format_of_choice[i][0] = FMT_LUMINANCE;
+		}
+
+		float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size);
+		float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size);
+		float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a;
+
+		// Offset encoding and blue-contraction halve or better the
+		// effective quantization error when available
+		float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f;
+		float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f;
+
+		float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f;
+		float error_scale_oe_rgb = eci.can_offset_encode ? 0.25f : 1.0f;
+
+		// Pick among the available LDR endpoint modes
+		for (int i = QUANT_6; i <= QUANT_256; i++)
+		{
+			// Offset encoding not possible at higher quant levels
+			if (i >= QUANT_192)
+			{
+				error_scale_oe_rgba = 1.0f;
+				error_scale_oe_rgb = 1.0f;
+			}
+
+			float base_quant_error = baseline_quant_error[i - QUANT_6];
+			float quant_error_rgb  = base_quant_error_rgb * base_quant_error;
+			float quant_error_rgba = base_quant_error_rgba * base_quant_error;
+
+			// 8 integers can encode as RGBA+RGBA
+			float full_ldr_rgba_error = quant_error_rgba
+			                          * error_scale_bc_rgba
+			                          * error_scale_oe_rgba
+			                          + rgb_range_error
+			                          + alpha_range_error;
+
+			best_error[i][3] = full_ldr_rgba_error;
+			format_of_choice[i][3] = FMT_RGBA;
+
+			// 6 integers can encode as RGB+RGB or RGBS+AA
+			float full_ldr_rgb_error = quant_error_rgb
+			                         * error_scale_bc_rgb
+			                         * error_scale_oe_rgb
+			                         + rgb_range_error
+			                         + eci.alpha_drop_error;
+
+			float rgbs_alpha_error = quant_error_rgba
+			                       + eci.rgb_scale_error
+			                       + rgb_range_error
+			                       + alpha_range_error;
+
+			if (rgbs_alpha_error < full_ldr_rgb_error)
+			{
+				best_error[i][2] = rgbs_alpha_error;
+				format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA;
+			}
+			else
+			{
+				best_error[i][2] = full_ldr_rgb_error;
+				format_of_choice[i][2] = FMT_RGB;
+			}
+
+			// 4 integers can encode as RGBS or LA+LA
+			float ldr_rgbs_error = quant_error_rgb
+			                     + rgb_range_error
+			                     + eci.alpha_drop_error
+			                     + eci.rgb_scale_error;
+
+			float lum_alpha_error = quant_error_rgba
+			                      + rgb_range_error
+			                      + alpha_range_error
+			                      + eci.luminance_error;
+
+			if (ldr_rgbs_error < lum_alpha_error)
+			{
+				best_error[i][1] = ldr_rgbs_error;
+				format_of_choice[i][1] = FMT_RGB_SCALE;
+			}
+			else
+			{
+				best_error[i][1] = lum_alpha_error;
+				format_of_choice[i][1] = FMT_LUMINANCE_ALPHA;
+			}
+
+			// 2 integers can encode as L+L
+			float luminance_error = quant_error_rgb
+			                      + rgb_range_error
+			                      + eci.alpha_drop_error
+			                      + eci.luminance_error;
+
+			best_error[i][0] = luminance_error;
+			format_of_choice[i][0] = FMT_LUMINANCE;
+		}
+	}
+}
+
+/**
+ * @brief For one partition compute the best format and quantization for a given bit count.
+ *
+ * @param      best_combined_error    The best error for each quant level and integer count.
+ * @param      best_combined_format   The best format for each quant level and integer count.
+ * @param      bits_available         The number of bits available for encoding.
+ * @param[out] best_quant_level       The output best color quant level.
+ * @param[out] best_format            The output best color format.
+ *
+ * @return The output error for the best pairing.
+ */
+static float one_partition_find_best_combination_for_bitcount(
+	const float best_combined_error[21][4],
+	const uint8_t best_combined_format[21][4],
+	int bits_available,
+	uint8_t& best_quant_level,
+	uint8_t& best_format
+) {
+	float lowest_error = ERROR_CALC_DEFAULT;
+	int lowest_error_index = 0;
+
+	// Scan each candidate endpoint integer count (1 to 4 value pairs)
+	for (int int_idx = 0; int_idx < 4; int_idx++)
+	{
+		// Map the integer count and bit budget to a quantization level
+		int qlevel = quant_mode_table[int_idx + 1][bits_available];
+
+		// This endpoint format cannot be represented within the bit budget
+		if (qlevel < QUANT_6)
+		{
+			continue;
+		}
+
+		float candidate_error = best_combined_error[qlevel][int_idx];
+		if (candidate_error < lowest_error)
+		{
+			lowest_error = candidate_error;
+			lowest_error_index = int_idx;
+		}
+	}
+
+	int chosen_quant = quant_mode_table[lowest_error_index + 1][bits_available];
+	best_quant_level = static_cast<uint8_t>(chosen_quant);
+
+	// Fall back to luminance if no format was representable at all
+	best_format = FMT_LUMINANCE;
+	if (chosen_quant >= QUANT_6)
+	{
+		best_format = best_combined_format[chosen_quant][lowest_error_index];
+	}
+
+	return lowest_error;
+}
+
+/**
+ * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count.
+ *
+ * @param      best_error             The best error for a single endpoint quant level and integer count.
+ * @param      best_format            The best format for a single endpoint quant level and integer count.
+ * @param[out] best_combined_error    The best combined error pairings for the 2 partitions.
+ * @param[out] best_combined_format   The best combined format pairings for the 2 partitions.
+ */
+static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
+	const float best_error[2][21][4],	// indexed by (partition, quant-level, integer-pair-count-minus-1)
+	const uint8_t best_format[2][21][4],
+	float best_combined_error[21][7],	// indexed by (quant-level, integer-pair-count-minus-2)
+	uint8_t best_combined_format[21][7][2]
+) {
+	// Seed with "impossibly bad" errors so any real pairing wins the compare
+	for (int i = QUANT_2; i <= QUANT_256; i++)
+	{
+		for (int j = 0; j < 7; j++)
+		{
+			best_combined_error[i][j] = ERROR_CALC_DEFAULT;
+		}
+	}
+
+	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
+	{
+		for (int i = 0; i < 4; i++)	// integer-count for first endpoint-pair
+		{
+			for (int j = 0; j < 4; j++)	// integer-count for second endpoint-pair
+			{
+				// Only consider pairings whose per-partition integer counts
+				// differ by at most one
+				int low2 = astc::min(i, j);
+				int high2 = astc::max(i, j);
+				if ((high2 - low2) > 1)
+				{
+					continue;
+				}
+
+				int intcnt = i + j;
+				// Clamp so the summed error stays finite
+				float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f);
+				// NOTE: <= means later candidates win ties
+				if (errorterm <= best_combined_error[quant][intcnt])
+				{
+					best_combined_error[quant][intcnt] = errorterm;
+					best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
+					best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
+				}
+			}
+		}
+	}
+}
+
+/**
+ * @brief For 2 partitions compute the best format and quantization for a given bit count.
+ *
+ * @param      best_combined_error    The best error for each quant level and integer count.
+ * @param      best_combined_format   The best format for each quant level and integer count.
+ * @param      bits_available         The number of bits available for encoding.
+ * @param[out] best_quant_level       The output best color quant level.
+ * @param[out] best_quant_level_mod   The output best color quant level assuming two more bits are available.
+ * @param[out] best_formats           The output best color formats.
+ *
+ * @return The output error for the best pairing.
+ */
+static float two_partitions_find_best_combination_for_bitcount(
+	float best_combined_error[21][7],
+	uint8_t best_combined_format[21][7][2],
+	int bits_available,
+	uint8_t& best_quant_level,
+	uint8_t& best_quant_level_mod,
+	uint8_t* best_formats
+) {
+	int best_integer_count = 0;
+	float best_integer_count_error = ERROR_CALC_DEFAULT;
+
+	for (int integer_count = 2; integer_count <= 8; integer_count++)
+	{
+		// Compute the quantization level for a given number of integers and a given number of bits
+		int quant_level = quant_mode_table[integer_count][bits_available];
+
+		// Don't have enough bits to represent a given endpoint format at all!
+		// break (not continue): presumably quant_mode_table is non-increasing
+		// in integer count for a fixed bit budget -- TODO confirm
+		if (quant_level < QUANT_6)
+		{
+			break;
+		}
+
+		float integer_count_error = best_combined_error[quant_level][integer_count - 2];
+		if (integer_count_error < best_integer_count_error)
+		{
+			best_integer_count_error = integer_count_error;
+			best_integer_count = integer_count;
+		}
+	}
+
+	int ql = quant_mode_table[best_integer_count][bits_available];
+	// "mod" level: same lookup with 2 extra bits of headroom
+	int ql_mod = quant_mode_table[best_integer_count][bits_available + 2];
+
+	best_quant_level = static_cast<uint8_t>(ql);
+	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
+
+	if (ql >= QUANT_6)
+	{
+		for (int i = 0; i < 2; i++)
+		{
+			best_formats[i] = best_combined_format[ql][best_integer_count - 2][i];
+		}
+	}
+	else
+	{
+		// No representable encoding at this bit count; fall back to luminance
+		for (int i = 0; i < 2; i++)
+		{
+			best_formats[i] = FMT_LUMINANCE;
+		}
+	}
+
+	return best_integer_count_error;
+}
+
+/**
+ * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count.
+ *
+ * @param      best_error             The best error for a single endpoint quant level and integer count.
+ * @param      best_format            The best format for a single endpoint quant level and integer count.
+ * @param[out] best_combined_error    The best combined error pairings for the 3 partitions.
+ * @param[out] best_combined_format   The best combined format pairings for the 3 partitions.
+ */
+static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
+	const float best_error[3][21][4],	// indexed by (partition, quant-level, integer-count)
+	const uint8_t best_format[3][21][4],
+	float best_combined_error[21][10],
+	uint8_t best_combined_format[21][10][3]
+) {
+	// Seed with "impossibly bad" errors so any real combination wins the compare
+	for (int i = QUANT_2; i <= QUANT_256; i++)
+	{
+		for (int j = 0; j < 10; j++)
+		{
+			best_combined_error[i][j] = ERROR_CALC_DEFAULT;
+		}
+	}
+
+	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
+	{
+		for (int i = 0; i < 4; i++)	// integer-count for first endpoint-pair
+		{
+			for (int j = 0; j < 4; j++)	// integer-count for second endpoint-pair
+			{
+				// Only consider combinations whose per-partition integer
+				// counts differ by at most one across all partitions
+				int low2 = astc::min(i, j);
+				int high2 = astc::max(i, j);
+				if ((high2 - low2) > 1)
+				{
+					continue;
+				}
+
+				for (int k = 0; k < 4; k++)	// integer-count for third endpoint-pair
+				{
+					int low3 = astc::min(k, low2);
+					int high3 = astc::max(k, high2);
+					if ((high3 - low3) > 1)
+					{
+						continue;
+					}
+
+					int intcnt = i + j + k;
+					// Clamp so the summed error stays finite
+					float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f);
+					// NOTE: <= means later candidates win ties
+					if (errorterm <= best_combined_error[quant][intcnt])
+					{
+						best_combined_error[quant][intcnt] = errorterm;
+						best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
+						best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
+						best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
+					}
+				}
+			}
+		}
+	}
+}
+
+/**
+ * @brief For 3 partitions compute the best format and quantization for a given bit count.
+ *
+ * @param      best_combined_error    The best error for each quant level and integer count.
+ * @param      best_combined_format   The best format for each quant level and integer count.
+ * @param      bits_available         The number of bits available for encoding.
+ * @param[out] best_quant_level       The output best color quant level.
+ * @param[out] best_quant_level_mod   The output best color quant level assuming more bits (+5) are available.
+ * @param[out] best_formats           The output best color formats.
+ *
+ * @return The output error for the best pairing.
+ */
+static float three_partitions_find_best_combination_for_bitcount(
+	const float best_combined_error[21][10],
+	const uint8_t best_combined_format[21][10][3],
+	int bits_available,
+	uint8_t& best_quant_level,
+	uint8_t& best_quant_level_mod,
+	uint8_t* best_formats
+) {
+	int best_integer_count = 0;
+	float best_integer_count_error = ERROR_CALC_DEFAULT;
+
+	for (int integer_count = 3; integer_count <= 9; integer_count++)
+	{
+		// Compute the quantization level for a given number of integers and a given number of bits
+		int quant_level = quant_mode_table[integer_count][bits_available];
+
+		// Don't have enough bits to represent a given endpoint format at all!
+		// break (not continue): presumably quant_mode_table is non-increasing
+		// in integer count for a fixed bit budget -- TODO confirm
+		if (quant_level < QUANT_6)
+		{
+			break;
+		}
+
+		float integer_count_error = best_combined_error[quant_level][integer_count - 3];
+		if (integer_count_error < best_integer_count_error)
+		{
+			best_integer_count_error = integer_count_error;
+			best_integer_count = integer_count;
+		}
+	}
+
+	int ql = quant_mode_table[best_integer_count][bits_available];
+	// "mod" level: same lookup with 5 extra bits of headroom
+	int ql_mod = quant_mode_table[best_integer_count][bits_available + 5];
+
+	best_quant_level = static_cast<uint8_t>(ql);
+	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
+
+	if (ql >= QUANT_6)
+	{
+		for (int i = 0; i < 3; i++)
+		{
+			best_formats[i] = best_combined_format[ql][best_integer_count - 3][i];
+		}
+	}
+	else
+	{
+		// No representable encoding at this bit count; fall back to luminance
+		for (int i = 0; i < 3; i++)
+		{
+			best_formats[i] = FMT_LUMINANCE;
+		}
+	}
+
+	return best_integer_count_error;
+}
+
+/**
+ * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count.
+ *
+ * @param      best_error             The best error for a single endpoint quant level and integer count.
+ * @param      best_format            The best format for a single endpoint quant level and integer count.
+ * @param[out] best_combined_error    The best combined error pairings for the 4 partitions.
+ * @param[out] best_combined_format   The best combined format pairings for the 4 partitions.
+ */
+static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
+	const float best_error[4][21][4],	// indexed by (partition, quant-level, integer-count)
+	const uint8_t best_format[4][21][4],
+	float best_combined_error[21][13],
+	uint8_t best_combined_format[21][13][4]
+) {
+	// Seed with "impossibly bad" errors so any real combination wins the compare
+	for (int i = QUANT_2; i <= QUANT_256; i++)
+	{
+		for (int j = 0; j < 13; j++)
+		{
+			best_combined_error[i][j] = ERROR_CALC_DEFAULT;
+		}
+	}
+
+	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
+	{
+		for (int i = 0; i < 4; i++)	// integer-count for first endpoint-pair
+		{
+			for (int j = 0; j < 4; j++)	// integer-count for second endpoint-pair
+			{
+				// Only consider combinations whose per-partition integer
+				// counts differ by at most one across all partitions
+				int low2 = astc::min(i, j);
+				int high2 = astc::max(i, j);
+				if ((high2 - low2) > 1)
+				{
+					continue;
+				}
+
+				for (int k = 0; k < 4; k++)	// integer-count for third endpoint-pair
+				{
+					int low3 = astc::min(k, low2);
+					int high3 = astc::max(k, high2);
+					if ((high3 - low3) > 1)
+					{
+						continue;
+					}
+
+					for (int l = 0; l < 4; l++)	// integer-count for fourth endpoint-pair
+					{
+						int low4 = astc::min(l, low3);
+						int high4 = astc::max(l, high3);
+						if ((high4 - low4) > 1)
+						{
+							continue;
+						}
+
+						int intcnt = i + j + k + l;
+						// Clamp so the summed error stays finite
+						float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f);
+						// NOTE: <= means later candidates win ties
+						if (errorterm <= best_combined_error[quant][intcnt])
+						{
+							best_combined_error[quant][intcnt] = errorterm;
+							best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
+							best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
+							best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
+							best_combined_format[quant][intcnt][3] = best_format[3][quant][l];
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/**
+ * @brief For 4 partitions compute the best format and quantization for a given bit count.
+ *
+ * @param      best_combined_error    The best error for each quant level and integer count.
+ * @param      best_combined_format   The best format for each quant level and integer count.
+ * @param      bits_available         The number of bits available for encoding.
+ * @param[out] best_quant_level       The output best color quant level.
+ * @param[out] best_quant_level_mod   The output best color quant level assuming more bits (+8) are available.
+ * @param[out] best_formats           The output best color formats.
+ *
+ * @return best_error The output error for the best pairing.
+ */
+static float four_partitions_find_best_combination_for_bitcount(
+	const float best_combined_error[21][13],
+	const uint8_t best_combined_format[21][13][4],
+	int bits_available,
+	uint8_t& best_quant_level,
+	uint8_t& best_quant_level_mod,
+	uint8_t* best_formats
+) {
+	int best_integer_count = 0;
+	float best_integer_count_error = ERROR_CALC_DEFAULT;
+
+	for (int integer_count = 4; integer_count <= 9; integer_count++)
+	{
+		// Compute the quantization level for a given number of integers and a given number of bits
+		int quant_level = quant_mode_table[integer_count][bits_available];
+
+		// Don't have enough bits to represent a given endpoint format at all!
+		// break (not continue): presumably quant_mode_table is non-increasing
+		// in integer count for a fixed bit budget -- TODO confirm
+		if (quant_level < QUANT_6)
+		{
+			break;
+		}
+
+		float integer_count_error = best_combined_error[quant_level][integer_count - 4];
+		if (integer_count_error < best_integer_count_error)
+		{
+			best_integer_count_error = integer_count_error;
+			best_integer_count = integer_count;
+		}
+	}
+
+	int ql = quant_mode_table[best_integer_count][bits_available];
+	// "mod" level: same lookup with 8 extra bits of headroom
+	int ql_mod = quant_mode_table[best_integer_count][bits_available + 8];
+
+	best_quant_level = static_cast<uint8_t>(ql);
+	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
+
+	if (ql >= QUANT_6)
+	{
+		for (int i = 0; i < 4; i++)
+		{
+			best_formats[i] = best_combined_format[ql][best_integer_count - 4][i];
+		}
+	}
+	else
+	{
+		// No representable encoding at this bit count; fall back to luminance
+		for (int i = 0; i < 4; i++)
+		{
+			best_formats[i] = FMT_LUMINANCE;
+		}
+	}
+
+	return best_integer_count_error;
+}
+
+/* See header for documentation. */
+unsigned int compute_ideal_endpoint_formats(
+	const partition_info& pi,
+	const image_block& blk,
+	const endpoints& ep,
+	 // bitcounts and errors computed for the various quantization methods
+	const int8_t* qwt_bitcounts,
+	const float* qwt_errors,
+	unsigned int tune_candidate_limit,
+	unsigned int start_block_mode,
+	unsigned int end_block_mode,
+	// output data
+	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
+	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
+	compression_working_buffers& tmpbuf
+) {
+	int partition_count = pi.partition_count;
+
+	promise(partition_count > 0);
+
+	// HDR encoding state is taken from texel 0 of the block
+	bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]);
+	bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]);
+
+	// Compute the errors that result from various encoding choices (such as using luminance instead
+	// of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
+	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS];
+	compute_encoding_choice_errors(blk, pi, ep, eci);
+
+	// Per-partition best error/format for every (quant level, integer count) pairing
+	float best_error[BLOCK_MAX_PARTITIONS][21][4];
+	uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
+	for (int i = 0; i < partition_count; i++)
+	{
+		compute_color_error_for_every_integer_count_and_quant_level(
+		    encode_hdr_rgb, encode_hdr_alpha, i,
+		    pi, eci[i], ep, blk.channel_weight, best_error[i],
+		    format_of_choice[i]);
+	}
+
+	float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
+	uint8_t* best_quant_levels = tmpbuf.best_quant_levels;
+	uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
+	uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
+
+	// Ensure that the first iteration understep contains data that will never be picked
+	vfloat clear_error(ERROR_CALC_DEFAULT);
+	vint clear_quant(0);
+
+	unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
+	storea(clear_error, errors_of_best_combination + packed_start_block_mode);
+	store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
+	store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
+
+	// Ensure that last iteration overstep contains data that will never be picked
+	unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
+	storea(clear_error, errors_of_best_combination + packed_end_block_mode);
+	store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
+	store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
+
+	// Track a scalar best to avoid expensive search at least once ...
+	float error_of_best_combination = ERROR_CALC_DEFAULT;
+	int index_of_best_combination = -1;
+
+	// The block contains 1 partition
+	if (partition_count == 1)
+	{
+		for (unsigned int i = start_block_mode; i < end_block_mode; i++)
+		{
+			// Skip block modes already marked as unusable by the weight search
+			if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
+			{
+				errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
+				continue;
+			}
+
+			float error_of_best = one_partition_find_best_combination_for_bitcount(
+			    best_error[0], format_of_choice[0], qwt_bitcounts[i],
+			    best_quant_levels[i], best_ep_formats[i][0]);
+
+			float total_error = error_of_best + qwt_errors[i];
+			errors_of_best_combination[i] = total_error;
+			// Single partition has no separate "mod" level; reuse the base one
+			best_quant_levels_mod[i] = best_quant_levels[i];
+
+			if (total_error < error_of_best_combination)
+			{
+				error_of_best_combination = total_error;
+				index_of_best_combination = i;
+			}
+		}
+	}
+	// The block contains 2 partitions
+	else if (partition_count == 2)
+	{
+		float combined_best_error[21][7];
+		uint8_t formats_of_choice[21][7][2];
+
+		two_partitions_find_best_combination_for_every_quantization_and_integer_count(
+		    best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		assert(start_block_mode == 0);
+		for (unsigned int i = 0; i < end_block_mode; i++)
+		{
+			if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
+			{
+				errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
+				continue;
+			}
+
+			float error_of_best = two_partitions_find_best_combination_for_bitcount(
+			    combined_best_error, formats_of_choice, qwt_bitcounts[i],
+			    best_quant_levels[i], best_quant_levels_mod[i],
+			    best_ep_formats[i]);
+
+			float total_error = error_of_best + qwt_errors[i];
+			errors_of_best_combination[i] = total_error;
+
+			if (total_error < error_of_best_combination)
+			{
+				error_of_best_combination = total_error;
+				index_of_best_combination = i;
+			}
+		}
+	}
+	// The block contains 3 partitions
+	else if (partition_count == 3)
+	{
+		float combined_best_error[21][10];
+		uint8_t formats_of_choice[21][10][3];
+
+		three_partitions_find_best_combination_for_every_quantization_and_integer_count(
+		    best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		assert(start_block_mode == 0);
+		for (unsigned int i = 0; i < end_block_mode; i++)
+		{
+			if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
+			{
+				errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
+				continue;
+			}
+
+			float error_of_best = three_partitions_find_best_combination_for_bitcount(
+			    combined_best_error, formats_of_choice, qwt_bitcounts[i],
+			    best_quant_levels[i], best_quant_levels_mod[i],
+			    best_ep_formats[i]);
+
+			float total_error = error_of_best + qwt_errors[i];
+			errors_of_best_combination[i] = total_error;
+
+			if (total_error < error_of_best_combination)
+			{
+				error_of_best_combination = total_error;
+				index_of_best_combination = i;
+			}
+		}
+	}
+	// The block contains 4 partitions
+	else // if (partition_count == 4)
+	{
+		assert(partition_count == 4);
+		float combined_best_error[21][13];
+		uint8_t formats_of_choice[21][13][4];
+
+		four_partitions_find_best_combination_for_every_quantization_and_integer_count(
+		    best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		assert(start_block_mode == 0);
+		for (unsigned int i = 0; i < end_block_mode; i++)
+		{
+			if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
+			{
+				errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
+				continue;
+			}
+
+			float error_of_best = four_partitions_find_best_combination_for_bitcount(
+			    combined_best_error, formats_of_choice, qwt_bitcounts[i],
+			    best_quant_levels[i], best_quant_levels_mod[i],
+			    best_ep_formats[i]);
+
+			float total_error = error_of_best + qwt_errors[i];
+			errors_of_best_combination[i] = total_error;
+
+			if (total_error < error_of_best_combination)
+			{
+				error_of_best_combination = total_error;
+				index_of_best_combination = i;
+			}
+		}
+	}
+
+	int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES];
+
+	// Fast path the first result and avoid the list search for trial 0
+	best_error_weights[0] = index_of_best_combination;
+	if (index_of_best_combination >= 0)
+	{
+		// Max the error for this candidate so we don't pick it again
+		errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT;
+	}
+
+	// Search the remaining results and pick the best candidate modes for trial 1+
+	for (unsigned int i = 1; i < tune_candidate_limit; i++)
+	{
+		vint vbest_error_index(-1);
+		vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
+
+		// Rounding is idempotent, so repeating it each iteration is harmless
+		start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
+		vint lane_ids = vint::lane_id() + vint(start_block_mode);
+		for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat err = vfloat(errors_of_best_combination + j);
+			vmask mask = err < vbest_ep_error;
+			vbest_ep_error = select(vbest_ep_error, err, mask);
+			vbest_error_index = select(vbest_error_index, lane_ids, mask);
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
+		vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
+		vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
+		vbest_error_index = hmin(vbest_error_index);
+		int best_error_index = vbest_error_index.lane<0>();
+
+		best_error_weights[i] = best_error_index;
+
+		// Max the error for this candidate so we don't pick it again
+		if (best_error_index >= 0)
+		{
+			errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT;
+		}
+		// Early-out if no more candidates are valid
+		else
+		{
+			break;
+		}
+	}
+
+	// Emit the final candidate list; return early with a smaller count if
+	// fewer than tune_candidate_limit valid candidates were found
+	for (unsigned int i = 0; i < tune_candidate_limit; i++)
+	{
+		if (best_error_weights[i] < 0)
+		{
+			return i;
+		}
+
+		block_mode[i] = best_error_weights[i];
+
+		quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]);
+		quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]);
+
+		assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
+		assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
+
+		for (int j = 0; j < partition_count; j++)
+		{
+			partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j];
+		}
+	}
+
+	return tune_candidate_limit;
+}
+
+#endif

+ 166 - 0
thirdparty/astcenc/astcenc_platform_isa_detection.cpp

@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Platform-specific function implementations.
+ *
+ * This module contains functions for querying the host extended ISA support.
+ */
+
+// Include before the defines below to pick up any auto-setup based on compiler
+// built-in config, if not being set explicitly by the build system
+#include "astcenc_internal.h"
+
+#if (ASTCENC_SSE > 0)    || (ASTCENC_AVX > 0) || \
+    (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
+
+/** Set to true once detect_cpu_isa() has populated the flags below. */
+static bool g_init { false };
+
+/** Does this CPU support SSE 4.1? Only valid once g_init is true. */
+static bool g_cpu_has_sse41 { false };
+
+/** Does this CPU support AVX2? Only valid once g_init is true. */
+static bool g_cpu_has_avx2 { false };
+
+/** Does this CPU support POPCNT? Only valid once g_init is true. */
+static bool g_cpu_has_popcnt { false };
+
+/** Does this CPU support F16C? Only valid once g_init is true. */
+static bool g_cpu_has_f16c { false };
+
+/* ============================================================================
+   Platform code for Visual Studio
+============================================================================ */
+#if !defined(__clang__) && defined(_MSC_VER)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ */
+static void detect_cpu_isa()
+{
+	int data[4];
+
+	// Leaf 0: EAX returns the highest supported standard CPUID leaf
+	__cpuid(data, 0);
+	int num_id = data[0];
+
+	if (num_id >= 1)
+	{
+		__cpuidex(data, 1, 0);
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
+	}
+
+	if (num_id >= 7)
+	{
+		__cpuidex(data, 7, 0);
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	MemoryBarrier();
+	g_init = true;
+}
+
+/* ============================================================================
+   Platform code for GCC and Clang
+============================================================================ */
+#else
+#include <cpuid.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ */
+static void detect_cpu_isa()
+{
+	unsigned int data[4];
+
+	// __get_cpuid_count returns non-zero only if the requested leaf exists
+	if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3]))
+	{
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
+	}
+
+	// Explicitly reset in case leaf 7 is unavailable
+	g_cpu_has_avx2 = 0;
+	if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3]))
+	{
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	__sync_synchronize();
+	g_init = true;
+}
+#endif
+
+/* See header for documentation. */
+bool cpu_supports_popcnt()
+{
+	// Lazily detect on first query; detection writes the same values each
+	// time, so a benign race between callers only repeats the work
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_popcnt;
+}
+
+/* See header for documentation. */
+bool cpu_supports_f16c()
+{
+	// Lazily detect on first query; see cpu_supports_popcnt for rationale
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_f16c;
+}
+
+/* See header for documentation. */
+bool cpu_supports_sse41()
+{
+	// Lazily detect on first query; see cpu_supports_popcnt for rationale
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_sse41;
+}
+
+/* See header for documentation. */
+bool cpu_supports_avx2()
+{
+	// Lazily detect on first query; see cpu_supports_popcnt for rationale
+	if (!g_init)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_avx2;
+}
+
+#endif

+ 904 - 0
thirdparty/astcenc/astcenc_quantization.cpp

@@ -0,0 +1,904 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data tables for numeric quantization.
+ */
+
+#include "astcenc_internal.h"
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+// Starts from QUANT_6
+// Not scrambled
+//
+// Lookup tables mapping an 8-bit input color value (inner index, 0..255) to
+// the nearest value representable at a given quantization level (outer index,
+// 17 levels starting at QUANT_6). For example table 0 only ever yields the
+// six values {0, 51, 102, 153, 204, 255}, and the last table is the identity
+// mapping for the full 256-level case.
+const uint8_t color_unquant_to_uquant_tables[17][256] {
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  23,  23,  23,  23,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  23,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,
+		 46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  69,  69,  69,  69,  69,  69,
+		 69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
+		 69,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,
+		 92,  92,  92,  92,  92,  92,  92,  92,  92, 116, 116, 116, 116, 116, 116, 116,
+		116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
+		139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
+		139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163,
+		163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186,
+		186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
+		186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
+		209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232,
+		232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
+		232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,  17,  17,  17,  17,  17,  17,  17,
+		 17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  34,  34,  34,  34,  34,  34,
+		 34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  68,  68,  68,  68,
+		 68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  85,  85,  85,
+		 85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119,
+		119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
+		136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+		136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+		170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
+		187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
+		221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
+		238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,
+		 27,  27,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,
+		 54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  67,  67,  67,
+		 67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  80,  80,  80,  80,  80,  80,
+		 80,  80,  80,  80,  80,  80,  80,  80,  94,  94,  94,  94,  94,  94,  94,  94,
+		 94,  94,  94,  94,  94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
+		107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148,
+		148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161,
+		161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175,
+		175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
+		188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
+		215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228,
+		228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242,
+		242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   0,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  33,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  33,  44,  44,  44,  44,  44,  44,  44,  44,  44,
+		 44,  44,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  66,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  66,  77,  77,  77,  77,  77,  77,  77,  77,
+		 77,  77,  77,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  99,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99,  99, 110, 110, 110, 110, 110, 110, 110,
+		110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145,
+		145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178,
+		178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211,
+		211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244,
+		244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,  16,  16,  16,
+		 16,  16,  16,  16,  16,  24,  24,  24,  24,  24,  24,  24,  24,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  41,  41,  41,  41,  41,  41,  41,  41,  49,  49,
+		 49,  49,  49,  49,  49,  49,  57,  57,  57,  57,  57,  57,  57,  57,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  74,  74,  74,  74,  74,  74,  74,  74,  82,
+		 82,  82,  82,  82,  82,  82,  82,  90,  90,  90,  90,  90,  90,  90,  90,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99, 107, 107, 107, 107, 107, 107, 107, 107,
+		115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140,
+		148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173,
+		173, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206,
+		206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239,
+		239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   0,   6,   6,   6,   6,   6,   6,  13,  13,  13,  13,  13,  13,
+		 13,  19,  19,  19,  19,  19,  19,  26,  26,  26,  26,  26,  26,  26,  32,  32,
+		 32,  32,  32,  32,  39,  39,  39,  39,  39,  39,  39,  45,  45,  45,  45,  45,
+		 45,  52,  52,  52,  52,  52,  52,  52,  58,  58,  58,  58,  58,  58,  65,  65,
+		 65,  65,  65,  65,  65,  71,  71,  71,  71,  71,  71,  78,  78,  78,  78,  78,
+		 78,  78,  84,  84,  84,  84,  84,  84,  91,  91,  91,  91,  91,  91,  91,  97,
+		 97,  97,  97,  97,  97, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110,
+		110, 110, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 145, 145,
+		145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158,
+		158, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 177, 177,
+		177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190,
+		190, 190, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 210,
+		210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223,
+		223, 223, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 242,
+		242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   5,   5,   5,   5,   5,   5,  11,  11,  11,  11,  11,  16,  16,
+		 16,  16,  16,  21,  21,  21,  21,  21,  21,  27,  27,  27,  27,  27,  32,  32,
+		 32,  32,  32,  32,  38,  38,  38,  38,  38,  43,  43,  43,  43,  43,  48,  48,
+		 48,  48,  48,  48,  54,  54,  54,  54,  54,  59,  59,  59,  59,  59,  59,  65,
+		 65,  65,  65,  65,  70,  70,  70,  70,  70,  70,  76,  76,  76,  76,  76,  81,
+		 81,  81,  81,  81,  86,  86,  86,  86,  86,  86,  92,  92,  92,  92,  92,  97,
+		 97,  97,  97,  97,  97, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 113,
+		113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124,
+		131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142,
+		142, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158,
+		158, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174,
+		174, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190,
+		190, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 207, 207, 207, 207,
+		207, 207, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 223, 223, 223, 223,
+		223, 223, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 239, 239, 239,
+		239, 239, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 255, 255, 255
+	},
+	{
+		  0,   0,   0,   4,   4,   4,   4,   8,   8,   8,   8,  12,  12,  12,  12,  16,
+		 16,  16,  16,  20,  20,  20,  20,  24,  24,  24,  24,  28,  28,  28,  28,  32,
+		 32,  32,  32,  36,  36,  36,  36,  40,  40,  40,  40,  44,  44,  44,  44,  48,
+		 48,  48,  48,  52,  52,  52,  52,  56,  56,  56,  56,  60,  60,  60,  60,  65,
+		 65,  65,  65,  65,  69,  69,  69,  69,  73,  73,  73,  73,  77,  77,  77,  77,
+		 81,  81,  81,  81,  85,  85,  85,  85,  89,  89,  89,  89,  93,  93,  93,  93,
+		 97,  97,  97,  97, 101, 101, 101, 101, 105, 105, 105, 105, 109, 109, 109, 109,
+		113, 113, 113, 113, 117, 117, 117, 117, 121, 121, 121, 121, 125, 125, 125, 125,
+		130, 130, 130, 130, 134, 134, 134, 134, 138, 138, 138, 138, 142, 142, 142, 142,
+		146, 146, 146, 146, 150, 150, 150, 150, 154, 154, 154, 154, 158, 158, 158, 158,
+		162, 162, 162, 162, 166, 166, 166, 166, 170, 170, 170, 170, 174, 174, 174, 174,
+		178, 178, 178, 178, 182, 182, 182, 182, 186, 186, 186, 186, 190, 190, 190, 190,
+		190, 195, 195, 195, 195, 199, 199, 199, 199, 203, 203, 203, 203, 207, 207, 207,
+		207, 211, 211, 211, 211, 215, 215, 215, 215, 219, 219, 219, 219, 223, 223, 223,
+		223, 227, 227, 227, 227, 231, 231, 231, 231, 235, 235, 235, 235, 239, 239, 239,
+		239, 243, 243, 243, 243, 247, 247, 247, 247, 251, 251, 251, 251, 255, 255, 255
+	},
+	{
+		  0,   0,   3,   3,   3,   6,   6,   6,   9,   9,   9,   9,  13,  13,  13,  16,
+		 16,  16,  19,  19,  19,  22,  22,  22,  25,  25,  25,  25,  29,  29,  29,  32,
+		 32,  32,  35,  35,  35,  38,  38,  38,  38,  42,  42,  42,  45,  45,  45,  48,
+		 48,  48,  51,  51,  51,  54,  54,  54,  54,  58,  58,  58,  61,  61,  61,  64,
+		 64,  64,  67,  67,  67,  67,  71,  71,  71,  74,  74,  74,  77,  77,  77,  80,
+		 80,  80,  83,  83,  83,  83,  87,  87,  87,  90,  90,  90,  93,  93,  93,  96,
+		 96,  96,  96, 100, 100, 100, 103, 103, 103, 106, 106, 106, 109, 109, 109, 112,
+		112, 112, 112, 116, 116, 116, 119, 119, 119, 122, 122, 122, 125, 125, 125, 125,
+		130, 130, 130, 130, 133, 133, 133, 136, 136, 136, 139, 139, 139, 143, 143, 143,
+		143, 146, 146, 146, 149, 149, 149, 152, 152, 152, 155, 155, 155, 159, 159, 159,
+		159, 162, 162, 162, 165, 165, 165, 168, 168, 168, 172, 172, 172, 172, 175, 175,
+		175, 178, 178, 178, 181, 181, 181, 184, 184, 184, 188, 188, 188, 188, 191, 191,
+		191, 194, 194, 194, 197, 197, 197, 201, 201, 201, 201, 204, 204, 204, 207, 207,
+		207, 210, 210, 210, 213, 213, 213, 217, 217, 217, 217, 220, 220, 220, 223, 223,
+		223, 226, 226, 226, 230, 230, 230, 230, 233, 233, 233, 236, 236, 236, 239, 239,
+		239, 242, 242, 242, 246, 246, 246, 246, 249, 249, 249, 252, 252, 252, 255, 255
+	},
+	{
+		  0,   0,   2,   2,   5,   5,   5,   8,   8,   8,  10,  10,  13,  13,  13,  16,
+		 16,  16,  18,  18,  21,  21,  21,  24,  24,  24,  26,  26,  29,  29,  29,  32,
+		 32,  32,  35,  35,  35,  37,  37,  40,  40,  40,  43,  43,  43,  45,  45,  48,
+		 48,  48,  51,  51,  51,  53,  53,  56,  56,  56,  59,  59,  59,  61,  61,  64,
+		 64,  64,  67,  67,  67,  70,  70,  70,  72,  72,  75,  75,  75,  78,  78,  78,
+		 80,  80,  83,  83,  83,  86,  86,  86,  88,  88,  91,  91,  91,  94,  94,  94,
+		 96,  96,  99,  99,  99, 102, 102, 102, 104, 104, 107, 107, 107, 110, 110, 110,
+		112, 112, 115, 115, 115, 118, 118, 118, 120, 120, 123, 123, 123, 126, 126, 126,
+		129, 129, 129, 132, 132, 132, 135, 135, 137, 137, 137, 140, 140, 140, 143, 143,
+		145, 145, 145, 148, 148, 148, 151, 151, 153, 153, 153, 156, 156, 156, 159, 159,
+		161, 161, 161, 164, 164, 164, 167, 167, 169, 169, 169, 172, 172, 172, 175, 175,
+		177, 177, 177, 180, 180, 180, 183, 183, 185, 185, 185, 188, 188, 188, 191, 191,
+		191, 194, 194, 196, 196, 196, 199, 199, 199, 202, 202, 204, 204, 204, 207, 207,
+		207, 210, 210, 212, 212, 212, 215, 215, 215, 218, 218, 220, 220, 220, 223, 223,
+		223, 226, 226, 226, 229, 229, 231, 231, 231, 234, 234, 234, 237, 237, 239, 239,
+		239, 242, 242, 242, 245, 245, 247, 247, 247, 250, 250, 250, 253, 253, 255, 255
+	},
+	{
+		  0,   0,   2,   2,   4,   4,   6,   6,   8,   8,  10,  10,  12,  12,  14,  14,
+		 16,  16,  18,  18,  20,  20,  22,  22,  24,  24,  26,  26,  28,  28,  30,  30,
+		 32,  32,  34,  34,  36,  36,  38,  38,  40,  40,  42,  42,  44,  44,  46,  46,
+		 48,  48,  50,  50,  52,  52,  54,  54,  56,  56,  58,  58,  60,  60,  62,  62,
+		 64,  64,  66,  66,  68,  68,  70,  70,  72,  72,  74,  74,  76,  76,  78,  78,
+		 80,  80,  82,  82,  84,  84,  86,  86,  88,  88,  90,  90,  92,  92,  94,  94,
+		 96,  96,  98,  98, 100, 100, 102, 102, 104, 104, 106, 106, 108, 108, 110, 110,
+		112, 112, 114, 114, 116, 116, 118, 118, 120, 120, 122, 122, 124, 124, 126, 126,
+		129, 129, 131, 131, 133, 133, 135, 135, 137, 137, 139, 139, 141, 141, 143, 143,
+		145, 145, 147, 147, 149, 149, 151, 151, 153, 153, 155, 155, 157, 157, 159, 159,
+		161, 161, 163, 163, 165, 165, 167, 167, 169, 169, 171, 171, 173, 173, 175, 175,
+		177, 177, 179, 179, 181, 181, 183, 183, 185, 185, 187, 187, 189, 189, 191, 191,
+		193, 193, 195, 195, 197, 197, 199, 199, 201, 201, 203, 203, 205, 205, 207, 207,
+		209, 209, 211, 211, 213, 213, 215, 215, 217, 217, 219, 219, 221, 221, 223, 223,
+		225, 225, 227, 227, 229, 229, 231, 231, 233, 233, 235, 235, 237, 237, 239, 239,
+		241, 241, 243, 243, 245, 245, 247, 247, 249, 249, 251, 251, 253, 253, 255, 255
+	},
+	{
+		  0,   1,   1,   3,   4,   4,   6,   6,   8,   9,   9,  11,  12,  12,  14,  14,
+		 16,  17,  17,  19,  20,  20,  22,  22,  24,  25,  25,  27,  28,  28,  30,  30,
+		 32,  33,  33,  35,  36,  36,  38,  38,  40,  41,  41,  43,  44,  44,  46,  46,
+		 48,  49,  49,  51,  52,  52,  54,  54,  56,  57,  57,  59,  60,  60,  62,  62,
+		 64,  65,  65,  67,  68,  68,  70,  70,  72,  73,  73,  75,  76,  76,  78,  78,
+		 80,  81,  81,  83,  84,  84,  86,  86,  88,  89,  89,  91,  92,  92,  94,  94,
+		 96,  97,  97,  99, 100, 100, 102, 102, 104, 105, 105, 107, 108, 108, 110, 110,
+		112, 113, 113, 115, 116, 116, 118, 118, 120, 121, 121, 123, 124, 124, 126, 126,
+		129, 129, 131, 131, 132, 134, 134, 135, 137, 137, 139, 139, 140, 142, 142, 143,
+		145, 145, 147, 147, 148, 150, 150, 151, 153, 153, 155, 155, 156, 158, 158, 159,
+		161, 161, 163, 163, 164, 166, 166, 167, 169, 169, 171, 171, 172, 174, 174, 175,
+		177, 177, 179, 179, 180, 182, 182, 183, 185, 185, 187, 187, 188, 190, 190, 191,
+		193, 193, 195, 195, 196, 198, 198, 199, 201, 201, 203, 203, 204, 206, 206, 207,
+		209, 209, 211, 211, 212, 214, 214, 215, 217, 217, 219, 219, 220, 222, 222, 223,
+		225, 225, 227, 227, 228, 230, 230, 231, 233, 233, 235, 235, 236, 238, 238, 239,
+		241, 241, 243, 243, 244, 246, 246, 247, 249, 249, 251, 251, 252, 254, 254, 255
+	},
+	{
+		  0,   1,   2,   2,   4,   5,   6,   6,   8,   9,  10,  10,  12,  13,  14,  14,
+		 16,  17,  18,  18,  20,  21,  22,  22,  24,  25,  26,  26,  28,  29,  30,  30,
+		 32,  33,  34,  34,  36,  37,  38,  38,  40,  41,  42,  42,  44,  45,  46,  46,
+		 48,  49,  50,  50,  52,  53,  54,  54,  56,  57,  58,  58,  60,  61,  62,  62,
+		 64,  65,  66,  66,  68,  69,  70,  70,  72,  73,  74,  74,  76,  77,  78,  78,
+		 80,  81,  82,  82,  84,  85,  86,  86,  88,  89,  90,  90,  92,  93,  94,  94,
+		 96,  97,  98,  98, 100, 101, 102, 102, 104, 105, 106, 106, 108, 109, 110, 110,
+		112, 113, 114, 114, 116, 117, 118, 118, 120, 121, 122, 122, 124, 125, 126, 126,
+		129, 129, 130, 131, 133, 133, 134, 135, 137, 137, 138, 139, 141, 141, 142, 143,
+		145, 145, 146, 147, 149, 149, 150, 151, 153, 153, 154, 155, 157, 157, 158, 159,
+		161, 161, 162, 163, 165, 165, 166, 167, 169, 169, 170, 171, 173, 173, 174, 175,
+		177, 177, 178, 179, 181, 181, 182, 183, 185, 185, 186, 187, 189, 189, 190, 191,
+		193, 193, 194, 195, 197, 197, 198, 199, 201, 201, 202, 203, 205, 205, 206, 207,
+		209, 209, 210, 211, 213, 213, 214, 215, 217, 217, 218, 219, 221, 221, 222, 223,
+		225, 225, 226, 227, 229, 229, 230, 231, 233, 233, 234, 235, 237, 237, 238, 239,
+		241, 241, 242, 243, 245, 245, 246, 247, 249, 249, 250, 251, 253, 253, 254, 255
+	},
+	{
+		  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+		 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+		 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+		 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+		 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+		 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+		 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+		112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+		128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+		144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+		160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+		176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+		192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+		208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+		224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+		240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+	}
+};
+
+// Starts from QUANT_6
+// Scrambled
+const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,  15
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,
+		 16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,
+		 19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  15,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,
+		 13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  15,  15,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   1,   1,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,
+		  2,   2,   2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,
+		  6,   6,   6,   6,   6,   6,   7,   7,   7,   7,   7,   7,   7,   7,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   9,   9,   9,   9,   9,   9,   9,   9,  10,
+		 10,  10,  10,  10,  10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  12,
+		 12,  12,  12,  12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,
+		 14,  14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,
+		 16,  16,  16,  16,  16,  16,  16,  16,  17,  17,  17,  17,  17,  17,  17,  17,
+		 18,  18,  18,  18,  18,  18,  18,  18,  19,  19,  19,  19,  19,  19,  19,  19,
+		 19,  20,  20,  20,  20,  20,  20,  20,  20,  21,  21,  21,  21,  21,  21,  21,
+		 21,  22,  22,  22,  22,  22,  22,  22,  22,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  24,  24,  24,  24,  24,  24,  24,  24,  25,  25,  25,  25,  25,  25,
+		 25,  25,  26,  26,  26,  26,  26,  26,  26,  26,  27,  27,  27,  27,  27,  27,
+		 27,  27,  27,  28,  28,  28,  28,  28,  28,  28,  28,  29,  29,  29,  29,  29,
+		 29,  29,  29,  30,  30,  30,  30,  30,  30,  30,  30,  31,  31,  31,  31,  31
+	},
+	{
+		  0,   0,   0,   0,   8,   8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,
+		 16,  24,  24,  24,  24,  24,  24,  32,  32,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  10,  18,  18,  18,  18,  18,
+		 18,  26,  26,  26,  26,  26,  26,  26,  34,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  20,  20,  20,  20,  20,
+		 20,  20,  28,  28,  28,  28,  28,  28,  36,  36,  36,  36,  36,  36,  36,   6,
+		  6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  14,  22,  22,  22,  22,
+		 22,  22,  30,  30,  30,  30,  30,  30,  30,  38,  38,  38,  38,  38,  38,  38,
+		 39,  39,  39,  39,  39,  39,  39,  31,  31,  31,  31,  31,  31,  31,  23,  23,
+		 23,  23,  23,  23,  15,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,
+		  7,  37,  37,  37,  37,  37,  37,  37,  29,  29,  29,  29,  29,  29,  21,  21,
+		 21,  21,  21,  21,  21,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  35,  27,  27,  27,  27,  27,  27,  27,  19,
+		 19,  19,  19,  19,  19,  11,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  33,  33,  25,  25,  25,  25,  25,  25,  17,
+		 17,  17,  17,  17,  17,  17,   9,   9,   9,   9,   9,   9,   1,   1,   1,   1
+	},
+	{
+		  0,   0,   0,  16,  16,  16,  16,  16,  16,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,  18,  18,  18,  18,  18,  18,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,   4,  20,  20,  20,  20,  20,  36,  36,  36,  36,  36,   6,   6,
+		  6,   6,   6,   6,  22,  22,  22,  22,  22,  38,  38,  38,  38,  38,  38,   8,
+		  8,   8,   8,   8,  24,  24,  24,  24,  24,  24,  40,  40,  40,  40,  40,  10,
+		 10,  10,  10,  10,  26,  26,  26,  26,  26,  26,  42,  42,  42,  42,  42,  12,
+		 12,  12,  12,  12,  12,  28,  28,  28,  28,  28,  44,  44,  44,  44,  44,  14,
+		 14,  14,  14,  14,  14,  30,  30,  30,  30,  30,  46,  46,  46,  46,  46,  46,
+		 47,  47,  47,  47,  47,  47,  31,  31,  31,  31,  31,  15,  15,  15,  15,  15,
+		 15,  45,  45,  45,  45,  45,  29,  29,  29,  29,  29,  13,  13,  13,  13,  13,
+		 13,  43,  43,  43,  43,  43,  27,  27,  27,  27,  27,  27,  11,  11,  11,  11,
+		 11,  41,  41,  41,  41,  41,  25,  25,  25,  25,  25,  25,   9,   9,   9,   9,
+		  9,  39,  39,  39,  39,  39,  39,  23,  23,  23,  23,  23,   7,   7,   7,   7,
+		  7,   7,  37,  37,  37,  37,  37,  21,  21,  21,  21,  21,   5,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  19,  19,  19,  19,  19,  19,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  17,  17,  17,  17,  17,  17,   1,   1,   1
+	},
+	{
+		  0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3,   4,
+		  4,   4,   4,   5,   5,   5,   5,   6,   6,   6,   6,   7,   7,   7,   7,   8,
+		  8,   8,   8,   9,   9,   9,   9,  10,  10,  10,  10,  11,  11,  11,  11,  12,
+		 12,  12,  12,  13,  13,  13,  13,  14,  14,  14,  14,  15,  15,  15,  15,  16,
+		 16,  16,  16,  16,  17,  17,  17,  17,  18,  18,  18,  18,  19,  19,  19,  19,
+		 20,  20,  20,  20,  21,  21,  21,  21,  22,  22,  22,  22,  23,  23,  23,  23,
+		 24,  24,  24,  24,  25,  25,  25,  25,  26,  26,  26,  26,  27,  27,  27,  27,
+		 28,  28,  28,  28,  29,  29,  29,  29,  30,  30,  30,  30,  31,  31,  31,  31,
+		 32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,
+		 36,  36,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,  39,  39,  39,
+		 40,  40,  40,  40,  41,  41,  41,  41,  42,  42,  42,  42,  43,  43,  43,  43,
+		 44,  44,  44,  44,  45,  45,  45,  45,  46,  46,  46,  46,  47,  47,  47,  47,
+		 47,  48,  48,  48,  48,  49,  49,  49,  49,  50,  50,  50,  50,  51,  51,  51,
+		 51,  52,  52,  52,  52,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  55,
+		 55,  56,  56,  56,  56,  57,  57,  57,  57,  58,  58,  58,  58,  59,  59,  59,
+		 59,  60,  60,  60,  60,  61,  61,  61,  61,  62,  62,  62,  62,  63,  63,  63
+	},
+	{
+		  0,   0,  16,  16,  16,  32,  32,  32,  48,  48,  48,  48,  64,  64,  64,   2,
+		  2,   2,  18,  18,  18,  34,  34,  34,  50,  50,  50,  50,  66,  66,  66,   4,
+		  4,   4,  20,  20,  20,  36,  36,  36,  36,  52,  52,  52,  68,  68,  68,   6,
+		  6,   6,  22,  22,  22,  38,  38,  38,  38,  54,  54,  54,  70,  70,  70,   8,
+		  8,   8,  24,  24,  24,  24,  40,  40,  40,  56,  56,  56,  72,  72,  72,  10,
+		 10,  10,  26,  26,  26,  26,  42,  42,  42,  58,  58,  58,  74,  74,  74,  12,
+		 12,  12,  12,  28,  28,  28,  44,  44,  44,  60,  60,  60,  76,  76,  76,  14,
+		 14,  14,  14,  30,  30,  30,  46,  46,  46,  62,  62,  62,  78,  78,  78,  78,
+		 79,  79,  79,  79,  63,  63,  63,  47,  47,  47,  31,  31,  31,  15,  15,  15,
+		 15,  77,  77,  77,  61,  61,  61,  45,  45,  45,  29,  29,  29,  13,  13,  13,
+		 13,  75,  75,  75,  59,  59,  59,  43,  43,  43,  27,  27,  27,  27,  11,  11,
+		 11,  73,  73,  73,  57,  57,  57,  41,  41,  41,  25,  25,  25,  25,   9,   9,
+		  9,  71,  71,  71,  55,  55,  55,  39,  39,  39,  39,  23,  23,  23,   7,   7,
+		  7,  69,  69,  69,  53,  53,  53,  37,  37,  37,  37,  21,  21,  21,   5,   5,
+		  5,  67,  67,  67,  51,  51,  51,  51,  35,  35,  35,  19,  19,  19,   3,   3,
+		  3,  65,  65,  65,  49,  49,  49,  49,  33,  33,  33,  17,  17,  17,   1,   1
+	},
+	{
+		  0,   0,  32,  32,  64,  64,  64,   2,   2,   2,  34,  34,  66,  66,  66,   4,
+		  4,   4,  36,  36,  68,  68,  68,   6,   6,   6,  38,  38,  70,  70,  70,   8,
+		  8,   8,  40,  40,  40,  72,  72,  10,  10,  10,  42,  42,  42,  74,  74,  12,
+		 12,  12,  44,  44,  44,  76,  76,  14,  14,  14,  46,  46,  46,  78,  78,  16,
+		 16,  16,  48,  48,  48,  80,  80,  80,  18,  18,  50,  50,  50,  82,  82,  82,
+		 20,  20,  52,  52,  52,  84,  84,  84,  22,  22,  54,  54,  54,  86,  86,  86,
+		 24,  24,  56,  56,  56,  88,  88,  88,  26,  26,  58,  58,  58,  90,  90,  90,
+		 28,  28,  60,  60,  60,  92,  92,  92,  30,  30,  62,  62,  62,  94,  94,  94,
+		 95,  95,  95,  63,  63,  63,  31,  31,  93,  93,  93,  61,  61,  61,  29,  29,
+		 91,  91,  91,  59,  59,  59,  27,  27,  89,  89,  89,  57,  57,  57,  25,  25,
+		 87,  87,  87,  55,  55,  55,  23,  23,  85,  85,  85,  53,  53,  53,  21,  21,
+		 83,  83,  83,  51,  51,  51,  19,  19,  81,  81,  81,  49,  49,  49,  17,  17,
+		 17,  79,  79,  47,  47,  47,  15,  15,  15,  77,  77,  45,  45,  45,  13,  13,
+		 13,  75,  75,  43,  43,  43,  11,  11,  11,  73,  73,  41,  41,  41,   9,   9,
+		  9,  71,  71,  71,  39,  39,   7,   7,   7,  69,  69,  69,  37,  37,   5,   5,
+		  5,  67,  67,  67,  35,  35,   3,   3,   3,  65,  65,  65,  33,  33,   1,   1
+	},
+	{
+		  0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   7,
+		  8,   8,   9,   9,  10,  10,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,
+		 16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,  23,
+		 24,  24,  25,  25,  26,  26,  27,  27,  28,  28,  29,  29,  30,  30,  31,  31,
+		 32,  32,  33,  33,  34,  34,  35,  35,  36,  36,  37,  37,  38,  38,  39,  39,
+		 40,  40,  41,  41,  42,  42,  43,  43,  44,  44,  45,  45,  46,  46,  47,  47,
+		 48,  48,  49,  49,  50,  50,  51,  51,  52,  52,  53,  53,  54,  54,  55,  55,
+		 56,  56,  57,  57,  58,  58,  59,  59,  60,  60,  61,  61,  62,  62,  63,  63,
+		 64,  64,  65,  65,  66,  66,  67,  67,  68,  68,  69,  69,  70,  70,  71,  71,
+		 72,  72,  73,  73,  74,  74,  75,  75,  76,  76,  77,  77,  78,  78,  79,  79,
+		 80,  80,  81,  81,  82,  82,  83,  83,  84,  84,  85,  85,  86,  86,  87,  87,
+		 88,  88,  89,  89,  90,  90,  91,  91,  92,  92,  93,  93,  94,  94,  95,  95,
+		 96,  96,  97,  97,  98,  98,  99,  99, 100, 100, 101, 101, 102, 102, 103, 103,
+		104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
+		112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
+		120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
+	},
+	{
+		  0,  32,  32,  64,  96,  96, 128, 128,   2,  34,  34,  66,  98,  98, 130, 130,
+		  4,  36,  36,  68, 100, 100, 132, 132,   6,  38,  38,  70, 102, 102, 134, 134,
+		  8,  40,  40,  72, 104, 104, 136, 136,  10,  42,  42,  74, 106, 106, 138, 138,
+		 12,  44,  44,  76, 108, 108, 140, 140,  14,  46,  46,  78, 110, 110, 142, 142,
+		 16,  48,  48,  80, 112, 112, 144, 144,  18,  50,  50,  82, 114, 114, 146, 146,
+		 20,  52,  52,  84, 116, 116, 148, 148,  22,  54,  54,  86, 118, 118, 150, 150,
+		 24,  56,  56,  88, 120, 120, 152, 152,  26,  58,  58,  90, 122, 122, 154, 154,
+		 28,  60,  60,  92, 124, 124, 156, 156,  30,  62,  62,  94, 126, 126, 158, 158,
+		159, 159, 127, 127,  95,  63,  63,  31, 157, 157, 125, 125,  93,  61,  61,  29,
+		155, 155, 123, 123,  91,  59,  59,  27, 153, 153, 121, 121,  89,  57,  57,  25,
+		151, 151, 119, 119,  87,  55,  55,  23, 149, 149, 117, 117,  85,  53,  53,  21,
+		147, 147, 115, 115,  83,  51,  51,  19, 145, 145, 113, 113,  81,  49,  49,  17,
+		143, 143, 111, 111,  79,  47,  47,  15, 141, 141, 109, 109,  77,  45,  45,  13,
+		139, 139, 107, 107,  75,  43,  43,  11, 137, 137, 105, 105,  73,  41,  41,   9,
+		135, 135, 103, 103,  71,  39,  39,   7, 133, 133, 101, 101,  69,  37,  37,   5,
+		131, 131,  99,  99,  67,  35,  35,   3, 129, 129,  97,  97,  65,  33,  33,   1
+	},
+	{
+		  0,  64, 128, 128,   2,  66, 130, 130,   4,  68, 132, 132,   6,  70, 134, 134,
+		  8,  72, 136, 136,  10,  74, 138, 138,  12,  76, 140, 140,  14,  78, 142, 142,
+		 16,  80, 144, 144,  18,  82, 146, 146,  20,  84, 148, 148,  22,  86, 150, 150,
+		 24,  88, 152, 152,  26,  90, 154, 154,  28,  92, 156, 156,  30,  94, 158, 158,
+		 32,  96, 160, 160,  34,  98, 162, 162,  36, 100, 164, 164,  38, 102, 166, 166,
+		 40, 104, 168, 168,  42, 106, 170, 170,  44, 108, 172, 172,  46, 110, 174, 174,
+		 48, 112, 176, 176,  50, 114, 178, 178,  52, 116, 180, 180,  54, 118, 182, 182,
+		 56, 120, 184, 184,  58, 122, 186, 186,  60, 124, 188, 188,  62, 126, 190, 190,
+		191, 191, 127,  63, 189, 189, 125,  61, 187, 187, 123,  59, 185, 185, 121,  57,
+		183, 183, 119,  55, 181, 181, 117,  53, 179, 179, 115,  51, 177, 177, 113,  49,
+		175, 175, 111,  47, 173, 173, 109,  45, 171, 171, 107,  43, 169, 169, 105,  41,
+		167, 167, 103,  39, 165, 165, 101,  37, 163, 163,  99,  35, 161, 161,  97,  33,
+		159, 159,  95,  31, 157, 157,  93,  29, 155, 155,  91,  27, 153, 153,  89,  25,
+		151, 151,  87,  23, 149, 149,  85,  21, 147, 147,  83,  19, 145, 145,  81,  17,
+		143, 143,  79,  15, 141, 141,  77,  13, 139, 139,  75,  11, 137, 137,  73,   9,
+		135, 135,  71,   7, 133, 133,  69,   5, 131, 131,  67,   3, 129, 129,  65,   1
+	},
+	{
+		  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+		 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+		 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+		 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+		 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+		 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+		 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+		112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+		128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+		144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+		160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+		176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+		192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+		208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+		224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+		240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+	}
+};
+
+#endif
+
// Starts from QUANT_6
// Scrambled
//
// Each color_scrambled_pquant_to_uquant_qN table maps an N-level quantized
// color value, indexed in its scrambled (packed) order, back to the
// unquantized 0-255 value it represents. The index order is the scrambled
// encoding order rather than ascending value order (presumably the ISE
// bit-packing order -- confirm against the ASTC specification).
static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
	  0, 255,  51, 204, 102, 153
};

static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
	  0,  36,  73, 109, 146, 182, 219, 255
};

static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
	  0, 255,  28, 227,  56, 199,  84, 171, 113, 142
};

static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
	  0, 255,  69, 186,  23, 232,  92, 163,  46, 209, 116, 139
};

static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
	  0,  17,  34,  51,  68,  85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
};
+
// Scrambled pquant-to-uquant color tables for 20, 24, 32, 40 and 48 levels;
// entries are in scrambled (packed) index order, see the "Scrambled" note
// preceding the QUANT_6 table.
static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
	  0, 255,  67, 188,  13, 242,  80, 175,  27, 228,  94, 161,  40, 215, 107, 148,
	 54, 201, 121, 134
};

static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
	  0, 255,  33, 222,  66, 189,  99, 156,  11, 244,  44, 211,  77, 178, 110, 145,
	 22, 233,  55, 200,  88, 167, 121, 134
};

static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
	  0,   8,  16,  24,  33,  41,  49,  57,  66,  74,  82,  90,  99, 107, 115, 123,
	132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
};

static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
	  0, 255,  32, 223,  65, 190,  97, 158,   6, 249,  39, 216,  71, 184, 104, 151,
	 13, 242,  45, 210,  78, 177, 110, 145,  19, 236,  52, 203,  84, 171, 117, 138,
	 26, 229,  58, 197,  91, 164, 123, 132
};

static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
	  0, 255,  16, 239,  32, 223,  48, 207,  65, 190,  81, 174,  97, 158, 113, 142,
	  5, 250,  21, 234,  38, 217,  54, 201,  70, 185,  86, 169, 103, 152, 119, 136,
	 11, 244,  27, 228,  43, 212,  59, 196,  76, 179,  92, 163, 108, 147, 124, 131
};
+
// Scrambled pquant-to-uquant color tables for 64, 80, 96 and 128 levels;
// entries are in scrambled (packed) index order, see the "Scrambled" note
// preceding the QUANT_6 table.
static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
	  0,   4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48,  52,  56,  60,
	 65,  69,  73,  77,  81,  85,  89,  93,  97, 101, 105, 109, 113, 117, 121, 125,
	130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
	195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
};

static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
	  0, 255,  16, 239,  32, 223,  48, 207,  64, 191,  80, 175,  96, 159, 112, 143,
	  3, 252,  19, 236,  35, 220,  51, 204,  67, 188,  83, 172, 100, 155, 116, 139,
	  6, 249,  22, 233,  38, 217,  54, 201,  71, 184,  87, 168, 103, 152, 119, 136,
	  9, 246,  25, 230,  42, 213,  58, 197,  74, 181,  90, 165, 106, 149, 122, 133,
	 13, 242,  29, 226,  45, 210,  61, 194,  77, 178,  93, 162, 109, 146, 125, 130
};

static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
	  2, 253,  10, 245,  18, 237,  26, 229,  35, 220,  43, 212,  51, 204,  59, 196,
	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
	  5, 250,  13, 242,  21, 234,  29, 226,  37, 218,  45, 210,  53, 202,  61, 194,
	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};

static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
	  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,
	 32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62,
	 64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,  92,  94,
	 96,  98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
	129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
	161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
	193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
	225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
};
+
// Scrambled pquant-to-uquant color tables for 160, 192 and 256 levels;
// entries are in scrambled (packed) index order, see the "Scrambled" note
// preceding the QUANT_6 table. The 256-level table is the identity mapping.
static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
	  1, 254,   9, 246,  17, 238,  25, 230,  33, 222,  41, 214,  49, 206,  57, 198,
	 65, 190,  73, 182,  81, 174,  89, 166,  97, 158, 105, 150, 113, 142, 121, 134,
	  3, 252,  11, 244,  19, 236,  27, 228,  35, 220,  43, 212,  51, 204,  59, 196,
	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
	  4, 251,  12, 243,  20, 235,  28, 227,  36, 219,  44, 211,  52, 203,  60, 195,
	 68, 187,  76, 179,  84, 171,  92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
	  6, 249,  14, 241,  22, 233,  30, 225,  38, 217,  46, 209,  54, 201,  62, 193,
	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};

static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
	  0, 255,   4, 251,   8, 247,  12, 243,  16, 239,  20, 235,  24, 231,  28, 227,
	 32, 223,  36, 219,  40, 215,  44, 211,  48, 207,  52, 203,  56, 199,  60, 195,
	 64, 191,  68, 187,  72, 183,  76, 179,  80, 175,  84, 171,  88, 167,  92, 163,
	 96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
	  1, 254,   5, 250,   9, 246,  13, 242,  17, 238,  21, 234,  25, 230,  29, 226,
	 33, 222,  37, 218,  41, 214,  45, 210,  49, 206,  53, 202,  57, 198,  61, 194,
	 65, 190,  69, 186,  73, 182,  77, 178,  81, 174,  85, 170,  89, 166,  93, 162,
	 97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
	  2, 253,   6, 249,  10, 245,  14, 241,  18, 237,  22, 233,  26, 229,  30, 225,
	 34, 221,  38, 217,  42, 213,  46, 209,  50, 205,  54, 201,  58, 197,  62, 193,
	 66, 189,  70, 185,  74, 181,  78, 177,  82, 173,  86, 169,  90, 165,  94, 161,
	 98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
};

static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
	 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
	 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
	 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
	 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
	 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
	 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
+
// Table of tables: 17 entries, one per quant level starting from QUANT_6
// (matching the "Starts from QUANT_6" note above the individual tables).
// Entry i points at the scrambled pquant-to-uquant table whose length is the
// number of levels for that quant mode; presumably indexed by quant level
// relative to QUANT_6 -- confirm against the callers.
const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
	color_scrambled_pquant_to_uquant_q6,
	color_scrambled_pquant_to_uquant_q8,
	color_scrambled_pquant_to_uquant_q10,
	color_scrambled_pquant_to_uquant_q12,
	color_scrambled_pquant_to_uquant_q16,
	color_scrambled_pquant_to_uquant_q20,
	color_scrambled_pquant_to_uquant_q24,
	color_scrambled_pquant_to_uquant_q32,
	color_scrambled_pquant_to_uquant_q40,
	color_scrambled_pquant_to_uquant_q48,
	color_scrambled_pquant_to_uquant_q64,
	color_scrambled_pquant_to_uquant_q80,
	color_scrambled_pquant_to_uquant_q96,
	color_scrambled_pquant_to_uquant_q128,
	color_scrambled_pquant_to_uquant_q160,
	color_scrambled_pquant_to_uquant_q192,
	color_scrambled_pquant_to_uquant_q256
};
+
// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer
// count and number of bits that the integer may fit into.
//
// Entries are -1 where the bit budget cannot represent that many integers at
// any quant level; otherwise values are in the range 0..20 and presumably
// index the quant level enumeration (21 levels, up to QUANT_256) -- confirm
// against the quant level definitions. Row 0 (integer count 0-1) is all -1.
const int8_t quant_mode_table[10][128] {
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    },
    {
         -1, -1,  0,  0,  2,  3,  5,  6,  8,  9, 11, 12, 14, 15, 17, 18,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1,  0,  0,  0,  1,  2,  2,  3,  4,  5,  5,  6,  7,
          8,  8,  9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,
          4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11,
         12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  1,  1,  1,
          2,  2,  2,  3,  3,  4,  4,  4,  5,  5,  5,  6,  6,  7,  7,  7,
          8,  8,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
         14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,
          1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  4,  4,  4,  4,  5,  5,
          5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10, 10,
         10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
         15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,
          0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
          4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
          8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
         12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
         16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,
          2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,
          6,  6,  6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,
          9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
         13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
         16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
          2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,
          5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,
          8,  8,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
         11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
         14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
    },
    {
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
          1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,
          4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,
          6,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  9,  9,
          9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
         12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
         14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
    }
};

+ 534 - 0
thirdparty/astcenc/astcenc_symbolic_physical.cpp

@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for converting between symbolic and physical encodings.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Write up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so
+ * may span two separate bytes in memory.
+ *
+ * @param         value       The value to write.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	int value,
+	int bitcount,
+	int bitoffset,
+	uint8_t* ptr
+) {
+	int mask = (1 << bitcount) - 1;
+	value &= mask;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	value <<= bitoffset;
+	mask <<= bitoffset;
+	mask = ~mask;
+
+	ptr[0] &= mask;
+	ptr[0] |= value;
+	ptr[1] &= mask >> 8;
+	ptr[1] |= value >> 8;
+}
+
+/**
+ * @brief Read up to 8 bits at an arbitrary bit offset.
+ *
+ * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
+ * span two separate bytes in memory.
+ *
+ * @param         bitcount    The number of bits to read.
+ * @param         bitoffset   The bit offset to read from, between 0 and 7.
+ * @param[in,out] ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline int read_bits(
+	int bitcount,
+	int bitoffset,
+	const uint8_t* ptr
+) {
+	int mask = (1 << bitcount) - 1;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	int value = ptr[0] | (ptr[1] << 8);
+	value >>= bitoffset;
+	value &= mask;
+	return value;
+}
+
+/**
+ * @brief Reverse bits in a byte.
+ *
+ * @param p   The value to reverse.
+ *
+ * @return The reversed result.
+ */
+static inline int bitrev8(int p)
+{
+	p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
+	p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
+	p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
+	return p;
+}
+
+/* See header for documentation. */
+void symbolic_to_physical(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	physical_compressed_block& pcb
+) {
+	assert(scb.block_type != SYM_BTYPE_ERROR);
+
+	// Constant color block using UNORM16 colors
+	if (scb.block_type == SYM_BTYPE_CONST_U16)
+	{
+		// There is currently no attempt to coalesce larger void-extents
+		static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (unsigned int i = 0; i < 8; i++)
+		{
+			pcb.data[i] = cbytes[i];
+		}
+
+		for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
+		{
+			pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
+			pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
+		}
+
+		return;
+	}
+
+	// Constant color block using FP16 colors
+	if (scb.block_type == SYM_BTYPE_CONST_F16)
+	{
+		// There is currently no attempt to coalesce larger void-extents
+		static const uint8_t cbytes[8]  { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (unsigned int i = 0; i < 8; i++)
+		{
+			pcb.data[i] = cbytes[i];
+		}
+
+		for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
+		{
+			pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
+			pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
+		}
+
+		return;
+	}
+
+	unsigned int partition_count = scb.partition_count;
+
+	// Compress the weights.
+	// They are encoded as an ordinary integer-sequence, then bit-reversed
+	uint8_t weightbuf[16] { 0 };
+
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+	int weight_count = di.weight_count;
+	quant_method weight_quant_method = bm.get_weight_quant_mode();
+	float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
+	int is_dual_plane = bm.is_dual_plane;
+
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	uint8_t weights[64];
+	if (is_dual_plane)
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			float uqw = static_cast<float>(scb.weights[i]);
+			float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			int qwi = static_cast<int>(qw + 0.5f);
+			weights[2 * i] = qat.scramble_map[qwi];
+
+			uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
+			qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			qwi = static_cast<int>(qw + 0.5f);
+			weights[2 * i + 1] = qat.scramble_map[qwi];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			float uqw = static_cast<float>(scb.weights[i]);
+			float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			int qwi = static_cast<int>(qw + 0.5f);
+			weights[i] = qat.scramble_map[qwi];
+		}
+	}
+
+	encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
+
+	for (int i = 0; i < 16; i++)
+	{
+		pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
+	}
+
+	write_bits(scb.block_mode, 11, 0, pcb.data);
+	write_bits(partition_count - 1, 2, 11, pcb.data);
+
+	int below_weights_pos = 128 - bits_for_weights;
+
+	// Encode partition index and color endpoint types for blocks with 2+ partitions
+	if (partition_count > 1)
+	{
+		write_bits(scb.partition_index, 6, 13, pcb.data);
+		write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
+
+		if (scb.color_formats_matched)
+		{
+			write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
+		}
+		else
+		{
+			// Check endpoint types for each partition to determine the lowest class present
+			int low_class = 4;
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int class_of_format = scb.color_formats[i] >> 2;
+				low_class = astc::min(class_of_format, low_class);
+			}
+
+			if (low_class == 3)
+			{
+				low_class = 2;
+			}
+
+			int encoded_type = low_class + 1;
+			int bitpos = 2;
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
+				encoded_type |= classbit_of_format << bitpos;
+				bitpos++;
+			}
+
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int lowbits_of_format = scb.color_formats[i] & 3;
+				encoded_type |= lowbits_of_format << bitpos;
+				bitpos += 2;
+			}
+
+			int encoded_type_lowpart = encoded_type & 0x3F;
+			int encoded_type_highpart = encoded_type >> 6;
+			int encoded_type_highpart_size = (3 * partition_count) - 4;
+			int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
+			write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
+			write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
+			below_weights_pos -= encoded_type_highpart_size;
+		}
+	}
+	else
+	{
+		write_bits(scb.color_formats[0], 4, 13, pcb.data);
+	}
+
+	// In dual-plane mode, encode the color component of the second plane of weights
+	if (is_dual_plane)
+	{
+		write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
+	}
+
+	// Encode the color components
+	uint8_t values_to_encode[32];
+	int valuecount_to_encode = 0;
+
+	const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		int vals = 2 * (scb.color_formats[i] >> 2) + 2;
+		assert(vals <= 8);
+		for (int j = 0; j < vals; j++)
+		{
+			values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
+		}
+		valuecount_to_encode += vals;
+	}
+
+	encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
+	           scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
+}
+
+/* See header for documentation. */
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const physical_compressed_block& pcb,
+	symbolic_compressed_block& scb
+) {
+	uint8_t bswapped[16];
+
+	scb.block_type = SYM_BTYPE_NONCONST;
+
+	// Extract header fields
+	int block_mode = read_bits(11, 0, pcb.data);
+	if ((block_mode & 0x1FF) == 0x1FC)
+	{
+		// Constant color block
+
+		// Check what format the data has
+		if (block_mode & 0x200)
+		{
+			scb.block_type = SYM_BTYPE_CONST_F16;
+		}
+		else
+		{
+			scb.block_type = SYM_BTYPE_CONST_U16;
+		}
+
+		scb.partition_count = 0;
+		for (int i = 0; i < 4; i++)
+		{
+			scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
+		}
+
+		// Additionally, check that the void-extent low/high coordinates are valid
+		if (bsd.zdim == 1)
+		{
+			// 2D void-extent
+			int rsvbits = read_bits(2, 10, pcb.data);
+			if (rsvbits != 3)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+
+			int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
+			int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
+			int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
+			int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
+
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+		else
+		{
+			// 3D void-extent
+			int vx_low_s = read_bits(9, 10, pcb.data);
+			int vx_high_s = read_bits(9, 19, pcb.data);
+			int vx_low_t = read_bits(9, 28, pcb.data);
+			int vx_high_t = read_bits(9, 37, pcb.data);
+			int vx_low_p = read_bits(9, 46, pcb.data);
+			int vx_high_p = read_bits(9, 55, pcb.data);
+
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+
+		return;
+	}
+
+	unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
+	if (packed_index == BLOCK_BAD_BLOCK_MODE)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	const auto& bm = bsd.get_block_mode(block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+
+	quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
+	int is_dual_plane = bm.is_dual_plane;
+
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int partition_count = read_bits(2, 11, pcb.data) + 1;
+	promise(partition_count > 0);
+
+	scb.block_mode = static_cast<uint16_t>(block_mode);
+	scb.partition_count = static_cast<uint8_t>(partition_count);
+
+	for (int i = 0; i < 16; i++)
+	{
+		bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
+	}
+
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	int below_weights_pos = 128 - bits_for_weights;
+
+	uint8_t indices[64];
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
+
+	if (is_dual_plane)
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
+			scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
+		}
+	}
+
+	if (is_dual_plane && partition_count == 4)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	scb.color_formats_matched = 0;
+
+	// Determine the format of each endpoint pair
+	int color_formats[BLOCK_MAX_PARTITIONS];
+	int encoded_type_highpart_size = 0;
+	if (partition_count == 1)
+	{
+		color_formats[0] = read_bits(4, 13, pcb.data);
+		scb.partition_index = 0;
+	}
+	else
+	{
+		encoded_type_highpart_size = (3 * partition_count) - 4;
+		below_weights_pos -= encoded_type_highpart_size;
+		int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
+		int baseclass = encoded_type & 0x3;
+		if (baseclass == 0)
+		{
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (encoded_type >> 2) & 0xF;
+			}
+
+			below_weights_pos += encoded_type_highpart_size;
+			scb.color_formats_matched = 1;
+			encoded_type_highpart_size = 0;
+		}
+		else
+		{
+			int bitpos = 2;
+			baseclass--;
+
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
+				bitpos++;
+			}
+
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] |= (encoded_type >> bitpos) & 3;
+				bitpos += 2;
+			}
+		}
+		scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
+	}
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
+	}
+
+	// Determine number of color endpoint integers
+	int color_integer_count = 0;
+	for (int i = 0; i < partition_count; i++)
+	{
+		int endpoint_class = color_formats[i] >> 2;
+		color_integer_count += (endpoint_class + 1) * 2;
+	}
+
+	if (color_integer_count > 18)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Determine the color endpoint format to use
+	static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
+	int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
+	if (is_dual_plane)
+	{
+		color_bits -= 2;
+	}
+
+	if (color_bits < 0)
+	{
+		color_bits = 0;
+	}
+
+	int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
+	if (color_quant_level < QUANT_6)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Unpack the integer color values and assign to endpoints
+	scb.quant_mode = static_cast<quant_method>(color_quant_level);
+
+	uint8_t values_to_decode[32];
+	decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
+	           values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
+
+	int valuecount_to_decode = 0;
+	const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
+	for (int i = 0; i < partition_count; i++)
+	{
+		int vals = 2 * (color_formats[i] >> 2) + 2;
+		for (int j = 0; j < vals; j++)
+		{
+			scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
+		}
+		valuecount_to_decode += vals;
+	}
+
+	// Fetch component for second-plane in the case of dual plane of weights.
+	scb.plane2_component = -1;
+	if (is_dual_plane)
+	{
+		scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
+	}
+}

+ 570 - 0
thirdparty/astcenc/astcenc_vecmathlib.h

@@ -0,0 +1,570 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+// Copyright 2008 Jose Fonseca
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements vector support for floats, ints, and vector lane
+ * control masks. It provides access to both explicit vector width types, and
+ * flexible N-wide types where N can be determined at compile time.
+ *
+ * The design of this module encourages use of vector length agnostic code, via
+ * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
+ * with that is available at compile time. The current vector width is
+ * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
+ *
+ * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
+ * These are provided primarily for prototyping and algorithm debug of VLA
+ * implementations.
+ *
+ * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
+ * types. These are provided for use by VLA code, but are also expected to be
+ * used as a fixed-width type and will support a reference C++ fallback for
+ * use on platforms without SIMD intrinsics.
+ *
+ * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
+ * types. These are provided for use by VLA code, and are not expected to be
+ * used as a fixed-width type in normal code. No reference C implementation is
+ * provided on platforms without underlying SIMD intrinsics.
+ *
+ * With the current implementation ISA support is provided for:
+ *
+ *     * 1-wide for scalar reference.
+ *     * 4-wide for Armv8-A NEON.
+ *     * 4-wide for x86-64 SSE2.
+ *     * 4-wide for x86-64 SSE4.1.
+ *     * 8-wide for x86-64 AVX2.
+ */
+
+#ifndef ASTC_VECMATHLIB_H_INCLUDED
+#define ASTC_VECMATHLIB_H_INCLUDED
+
+#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
+	#include <immintrin.h>
+#elif ASTCENC_NEON != 0
+	#include <arm_neon.h>
+#endif
+
+#if !defined(__clang__) && defined(_MSC_VER)
+	#define ASTCENC_SIMD_INLINE __forceinline
+	#define ASTCENC_NO_INLINE
+#elif defined(__GNUC__) && !defined(__clang__)
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#else
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#endif
+
+#if ASTCENC_AVX >= 2
+	/* If we have AVX2 expose 8-wide VLA. */
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_avx2_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+
+#elif ASTCENC_SSE >= 20
+	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
+#elif ASTCENC_NEON > 0
+	/* If we have NEON expose 4-wide VLA. */
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
+#else
+	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
+
+	// Note: We no longer expose the 1-wide scalar fallback because it is not
+	// invariant with the 4-wide path due to algorithms that use horizontal
+	// operations that accumulate a local vector sum before accumulating into
+	// a running sum.
+	//
+	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
+	//
+	//     result = ((((sum + l0) + l1) + l2) + l3)
+	//
+	// ... whereas the accumulator for a 4-wide vector sum is:
+	//
+	//     result = sum + ((l0 + l2) + (l1 + l3))
+	//
+	// In "normal maths" this is the same, but the floating point reassociation
+	// differences mean that these will not produce the same result.
+
+	#include "astcenc_vecmathlib_none_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+#endif
+
+/**
+ * @brief Round a count down to the largest multiple of 8.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(8 - 1));
+}
+
+/**
+ * @brief Round a count down to the largest multiple of 4.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(4 - 1));
+}
+
+/**
+ * @brief Round a count down to the largest multiple of the SIMD width.
+ *
+ * Assumption that the vector width is a power of two ...
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+}
+
+/**
+ * @brief Round a count up to the largest multiple of the SIMD width.
+ *
+ * Assumption that the vector width is a power of two ...
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+{
+	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+	return multiples * ASTCENC_SIMD_WIDTH;
+}
+
+/**
+ * @brief Return @c a with lanes negated if the @c b lane is negative.
+ */
+ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
+{
+	vint ia = float_as_int(a);
+	vint ib = float_as_int(b);
+	vint sign_mask(static_cast<int>(0x80000000));
+	vint r = ia ^ (ib & sign_mask);
+	return int_as_float(r);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan(x).
+ *
+ * Max error of this implementation is 0.004883.
+ */
+ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
+{
+	vmask c = abs(x) > vfloat(1.0f);
+	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
+	vfloat y = select(x, vfloat(1.0f) / x, c);
+	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
+	return select(y, z - y, c);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan2(x, y).
+ */
+ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
+{
+	vfloat z = atan(abs(y / x));
+	vmask xmask = vmask(float_as_int(x).m);
+	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+}
+
+/*
+ * @brief Factory that returns a unit length 4 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit4()
+{
+	return vfloat4(0.5f);
+}
+
+/**
+ * @brief Factory that returns a unit length 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit3()
+{
+	float val = 0.577350258827209473f;
+	return vfloat4(val, val, val, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a unit length 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit2()
+{
+	float val = 0.707106769084930420f;
+	return vfloat4(val, val, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
+{
+	return vfloat4(a, b, c, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
+{
+	return vfloat4(a, b, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Normalize a non-zero length vector to unit length.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
+{
+	vfloat4 length = dot(a, a);
+	return a / sqrt(length);
+}
+
+/**
+ * @brief Normalize a vector, returning @c safe if len is zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
+{
+	vfloat4 length = dot(a, a);
+	if (length.lane<0>() != 0.0f)
+	{
+		return a / sqrt(length);
+	}
+
+	return safe;
+}
+
+
+
+#define POLY0(x, c0)                     (                                     c0)
+#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
+#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
+#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
+#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
+#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
+
+/**
+ * @brief Compute an approximate exp2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
+{
+	x = clamp(-126.99999f, 129.0f, x);
+
+	vint4 ipart = float_to_int(x - 0.5f);
+	vfloat4 fpart = x - int_to_float(ipart);
+
+	// Integer contrib, using 1 << ipart
+	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
+
+	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
+	vfloat4 fexp = POLY5(fpart,
+	                     9.9999994e-1f,
+	                     6.9315308e-1f,
+	                     2.4015361e-1f,
+	                     5.5826318e-2f,
+	                     8.9893397e-3f,
+	                     1.8775767e-3f);
+
+	return iexp * fexp;
+}
+
+/**
+ * @brief Compute an approximate log2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
+{
+	vint4 exp(0x7F800000);
+	vint4 mant(0x007FFFFF);
+	vint4 one(0x3F800000);
+
+	vint4 i = float_as_int(x);
+
+	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
+
+	vfloat4 m = int_as_float((i & mant) | one);
+
+	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
+	vfloat4 p = POLY4(m,
+	                  2.8882704548164776201f,
+	                 -2.52074962577807006663f,
+	                  1.48116647521213171641f,
+	                 -0.465725644288844778798f,
+	                  0.0596515482674574969533f);
+
+	// Increases the polynomial degree, but ensures that log2(1) == 0
+	p = p * (m - 1.0f);
+
+	return p + e;
+}
+
+/**
+ * @brief Compute an approximate pow(x, y) for each lane in the vector.
+ *
+ * Power function based on the exp2(log2(x) * y) transform.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
+{
+	vmask4 zero_mask = y == vfloat4(0.0f);
+	vfloat4 estimate = exp2(log2(x) * y);
+
+	// Guarantee that y == 0 returns exactly 1.0f
+	return select(estimate, vfloat4(1.0f), zero_mask);
+}
+
+/**
+ * @brief Count the leading zeros for each lane in @c a.
+ *
+ * Valid for all data values of @c a; will return a per-lane value [0, 32].
+ */
+static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
+{
+	// This function is a horrible abuse of floating point exponents to convert
+	// the original integer value into a 2^N encoding we can recover easily.
+
+	// Convert to float without risk of rounding up by keeping only top 8 bits.
+	// This trick is guaranteed to keep top 8 bits and clear the 9th.
+	a = (~lsr<8>(a)) & a;
+	a = float_as_int(int_to_float(a));
+
+	// Extract and unbias exponent
+	a = vint4(127 + 31) - lsr<23>(a);
+
+	// Clamp result to a valid 32-bit range
+	return clamp(0, 32, a);
+}
+
+/**
+ * @brief Return lanewise 2^a for each lane in @c a.
+ *
+ * Use of signed int means that this is only valid for values in range [0, 31].
+ */
+static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
+{
+	// 2^30 is the largest signed number than can be represented
+	assert(all(a < vint4(31)));
+
+	// This function is a horrible abuse of floating point to use the exponent
+	// and float conversion to generate a 2^N multiple.
+
+	// Bias the exponent
+	vint4 exp = a + 127;
+	exp = lsl<23>(exp);
+
+	// Reinterpret the bits as a float, and then convert to an int
+	vfloat4 f = int_as_float(exp);
+	return float_to_int(f);
+}
+
+/**
+ * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
+ */
+static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
+{
+	vint4 fp16_one = vint4(0x3C00);
+	vint4 fp16_small = lsl<8>(p);
+
+	vmask4 is_one = p == vint4(0xFFFF);
+	vmask4 is_small = p < vint4(4);
+
+	// Manually inline clz() on Visual Studio to avoid release build codegen bug
+	// see https://github.com/ARM-software/astc-encoder/issues/259
+#if !defined(__clang__) && defined(_MSC_VER)
+	vint4 a = (~lsr<8>(p)) & p;
+	a = float_as_int(int_to_float(a));
+	a = vint4(127 + 31) - lsr<23>(a);
+	vint4 lz = clamp(0, 32, a) - 16;
+#else
+	vint4 lz = clz(p) - 16;
+#endif
+
+	p = p * two_to_the_n(lz + 1);
+	p = p & vint4(0xFFFF);
+
+	p = lsr<6>(p);
+
+	p = p | lsl<10>(vint4(14) - lz);
+
+	vint4 r = select(p, fp16_one, is_one);
+	r = select(r, fp16_small, is_small);
+	return r;
+}
+
+/**
+ * @brief Convert 16-bit LNS to float16.
+ */
+static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
+{
+	vint4 mc = p & 0x7FF;
+	vint4 ec = lsr<11>(p);
+
+	vint4 mc_512 = mc * 3;
+	vmask4 mask_512 = mc < vint4(512);
+
+	vint4 mc_1536 = mc * 4 - 512;
+	vmask4 mask_1536 = mc < vint4(1536);
+
+	vint4 mc_else = mc * 5 - 2048;
+
+	vint4 mt = mc_else;
+	mt = select(mt, mc_1536, mask_1536);
+	mt = select(mt, mc_512, mask_512);
+
+	vint4 res = lsl<10>(ec) | lsr<3>(mt);
+	return min(res, vint4(0x7BFF));
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * @param      a      The input value.
+ * @param[out] exp    The output exponent.
+ *
+ * @return The mantissa.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
+{
+	// Interpret the bits as an integer
+	vint4 ai = float_as_int(a);
+
+	// Extract and unbias the exponent
+	exp = (lsr<23>(ai) & 0xFF) - 126;
+
+	// Extract and unbias the mantissa
+	vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
+	return int_as_float(manti);
+}
+
+/**
+ * @brief Convert float to 16-bit LNS.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
+{
+	vint4 exp;
+	vfloat4 mant = frexp(a, exp);
+
+	// Do these early before we start messing about ...
+	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
+	vmask4 mask_infinity = a >= vfloat4(65536.0f);
+
+	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
+	vmask4 exp_lt_m13 = exp < vint4(-13);
+
+	vfloat4 a1a = a * 33554432.0f;
+	vint4 expa = vint4::zero();
+
+	vfloat4 a1b = (mant - 0.5f) * 4096;
+	vint4 expb = exp + 14;
+
+	a = select(a1b, a1a, exp_lt_m13);
+	exp = select(expb, expa, exp_lt_m13);
+
+	vmask4 a_lt_384 = a < vfloat4(384.0f);
+	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
+
+	vfloat4 a2a = a * (4.0f / 3.0f);
+	vfloat4 a2b = a + 128.0f;
+	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
+
+	a = a2c;
+	a = select(a, a2b, a_lt_1408);
+	a = select(a, a2a, a_lt_384);
+
+	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
+
+	a = select(a, vfloat4(65535.0f), mask_infinity);
+	a = select(a, vfloat4::zero(), mask_underflow_nan);
+
+	return a;
+}
+
+namespace astc
+{
+
+static ASTCENC_SIMD_INLINE float pow(float x, float y)
+{
+	return pow(vfloat4(x), vfloat4(y)).lane<0>();
+}
+
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED

+ 1204 - 0
thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h

@@ -0,0 +1,1204 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 8x32-bit vectors, implemented using AVX2.
+ *
+ * This module implements 8-wide 32-bit float, int, and mask vectors for x86
+ * AVX2.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ */
+
+#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
+#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// Define convenience intrinsics that are missing on older compilers
+#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
+
+// ============================================================================
+// vfloat8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide floats.
+ */
+struct vfloat8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
+	{
+		m = _mm256_loadu_ps(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
+	{
+		m = _mm256_set1_ps(a);
+	}
+
+	/**
+	 * @brief Construct from 8 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(
+		float a, float b, float c, float d,
+		float e, float f, float g, float h)
+	{
+		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+	#if !defined(__clang__) && defined(_MSC_VER)
+		return m.m256_f32[l];
+	#else
+		union { __m256 m; float f[8]; } cvt;
+		cvt.m = m;
+		return cvt.f[l];
+	#endif
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 zero()
+	{
+		return vfloat8(_mm256_setzero_ps());
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
+	{
+		return vfloat8(_mm256_broadcast_ss(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
+	{
+		return vfloat8(_mm256_load_ps(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
+	{
+		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
+	}
+
+	/**
+	 * @brief The underlying SIMD register.
+	 */
+	__m256 m;
+};
+
+// ============================================================================
+// vint8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide ints.
+ */
+struct vint8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
+	{
+		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+	}
+
+	/**
+	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
+	{
+		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
+		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vint8::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(int a)
+	{
+		m = _mm256_set1_epi32(a);
+	}
+
+	/**
+	 * @brief Construct from 8 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(
+		int a, int b, int c, int d,
+		int e, int f, int g, int h)
+	{
+		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+	#if !defined(__clang__) && defined(_MSC_VER)
+		return m.m256i_i32[l];
+	#else
+		union { __m256i m; int f[8]; } cvt;
+		cvt.m = m;
+		return cvt.f[l];
+	#endif
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 zero()
+	{
+		return vint8(_mm256_setzero_si256());
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
+	{
+		__m128i a = _mm_set1_epi32(*p);
+		return vint8(_mm256_broadcastd_epi32(a));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
+	{
+		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 lane_id()
+	{
+		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+	}
+
+	/**
+	 * @brief The underlying SIMD register.
+	 */
+	__m256i m;
+};
+
+// ============================================================================
+// vmask8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide control plane masks.
+ */
+struct vmask8
+{
+	/**
+	 * @brief Construct from an existing SIMD float register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD integer register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
+	{
+		m = _mm256_castsi256_ps(a);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value, replicated across all lanes.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
+	{
+		vint8 mask(a == false ? 0 : -1);
+		m = _mm256_castsi256_ps(mask.m);
+	}
+
+	/**
+	 * @brief The underlying SIMD register.
+	 */
+	__m256 m;
+};
+
+// ============================================================================
+// vmask8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
+{
+	return vmask8(_mm256_or_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
+{
+	return vmask8(_mm256_and_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
+{
+	return vmask8(_mm256_xor_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
+{
+	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
+}
+
+/**
+ * @brief Return an 8-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
+{
+	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask8 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask8 a)
+{
+	return mask(a) == 0xFF;
+}
+
+// ============================================================================
+// vint8 operators and functions
+// ============================================================================
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
+{
+	return vint8(_mm256_add_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+	return vint8(_mm256_sub_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+	return vint8(_mm256_mullo_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+	return vint8(_mm256_or_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+	return vint8(_mm256_and_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+	return vint8(_mm256_xor_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+	return vint8(_mm256_slli_epi32(a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+	return vint8(_mm256_srai_epi32(a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+	return vint8(_mm256_srli_epi32(a.m, s));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+	return vint8(_mm256_min_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+	return vint8(_mm256_max_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, broadcast to all lanes.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
+	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
+	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
+	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+
+	__m256i r = astcenc_mm256_set_m128i(m, m);
+	vint8 vmin(r);
+	return vmin;
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, broadcast to all lanes.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
+	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
+	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
+	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+
+	__m256i r = astcenc_mm256_set_m128i(m, m);
+	vint8 vmax(r);
+	return vmax;
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+	// This is the most logical implementation, but the convenience intrinsic
+	// is missing on older compilers (supported in g++ 9 and clang++ 9).
+	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
+	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
+{
+	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+{
+	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
+	                               0, 0, 0, 0, 28, 24, 20, 16,
+	                               0, 0, 0, 0,  0,  0,  0,  0,
+	                               0, 0, 0, 0, 12,  8,  4,  0);
+	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
+	__m128i a0 = _mm256_extracti128_si256(a, 0);
+	__m128i a1 = _mm256_extracti128_si256(a, 1);
+	__m128i b = _mm_unpacklo_epi32(a0, a1);
+
+	__m256i r = astcenc_mm256_set_m128i(b, b);
+	return vint8(r);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+	__m256i condi = _mm256_castps_si256(cond.m);
+	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_add_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_sub_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_mul_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
+{
+	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
+{
+	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_div_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
+{
+	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
+}
+
+
+/**
+ * @brief Overload: scalar by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
+{
+	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
+}
+
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_min_ps(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
+{
+	return min(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(_mm256_max_ps(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
+{
+	return max(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
+	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
+	return a;
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and max.
+ *
+ * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
+ * be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
+{
+	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
+	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
+	return a;
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
+{
+	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
+	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
+	return a;
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
+{
+	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+	return vfloat8(_mm256_and_ps(a.m, msk));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ */
+ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
+{
+	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+	return vfloat8(_mm256_round_ps(a.m, flags));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, broadcast to all lanes.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
+{
+	__m128 vlow = _mm256_castps256_ps128(a.m);
+	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
+	vlow = _mm_min_ps(vlow, vhigh);
+
+	// First do an horizontal reduction.
+	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
+	__m128 mins = _mm_min_ps(vlow, shuf);
+	shuf = _mm_movehl_ps(shuf, mins);
+	mins = _mm_min_ss(mins, shuf);
+
+	// This is the most logical implementation, but the convenience intrinsic
+	// is missing on older compilers (supported in g++ 9 and clang++ 9).
+	//__m256i r = _mm256_set_m128(m, m)
+	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);
+
+	return vfloat8(_mm256_permute_ps(r, 0));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, broadcast to all lanes.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
+{
+	__m128 vlow = _mm256_castps256_ps128(a.m);
+	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
+	vhigh = _mm_max_ps(vlow, vhigh);
+
+	// First do an horizontal reduction.
+	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
+	__m128 maxs = _mm_max_ps(vhigh, shuf);
+	shuf = _mm_movehl_ps(shuf,maxs);
+	maxs = _mm_max_ss(maxs, shuf);
+
+	// This is the most logical implementation, but the convenience intrinsic
+	// is missing on older compilers (supported in g++ 9 and clang++ 9).
+	//__m256i r = _mm256_set_m128(m, m)
+	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
+	return vfloat8(_mm256_permute_ps(r, 0));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
+{
+	return hmax(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
+{
+	// Two sequential 4-wide adds gives invariance with 4-wide code
+	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
+	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
+	return hadd_s(lo) + hadd_s(hi);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
+{
+	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
+{
+	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
+{
+	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
+	haccumulate(accum, lo);
+
+	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
+	haccumulate(accum, hi);
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
+{
+	accum += a;
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
+{
+	a = select(vfloat8::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
+{
+	a = select(vfloat8::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
+{
+	return vfloat8(_mm256_sqrt_ps(a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array;
+ */
+ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
+{
+	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
+{
+	_mm256_storeu_ps(p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
+{
+	_mm256_store_ps(p, a.m);
+}
+
+/**
+ * @brief Return a integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
+{
+	return vint8(_mm256_cvttps_epi32(a.m));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
+{
+	a = round(a);
+	return vint8(_mm256_cvttps_epi32(a.m));
+}
+
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
+{
+	return vfloat8(_mm256_cvtepi32_ps(a.m));
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
+{
+	return vint8(_mm256_castps_si256(a.m));
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
+{
+	return vfloat8(_mm256_castsi256_ps(a.m));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
+{
+	// AVX2 duplicates the table within each 128-bit lane
+	__m128i t0n = t0.m;
+	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
+{
+	// AVX2 duplicates the table within each 128-bit lane
+	__m128i t0n = t0.m;
+	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+
+	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
+	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
+{
+	// AVX2 duplicates the table within each 128-bit lane
+	__m128i t0n = t0.m;
+	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+
+	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
+	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+
+	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
+	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
+
+	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
+	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
+{
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	return vint8(result);
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
+{
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
+
+	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	result = _mm256_xor_si256(result, result2);
+	return vint8(result);
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
+{
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
+
+	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	result = _mm256_xor_si256(result, result2);
+	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
+
+	result2 = _mm256_shuffle_epi8(t2.m, idxx);
+	result = _mm256_xor_si256(result, result2);
+	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
+
+	result2 = _mm256_shuffle_epi8(t3.m, idxx);
+	result = _mm256_xor_si256(result, result2);
+
+	return vint8(result);
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high  bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
+{
+	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask)
+{
+	_mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint8 a)
+{
+	alignas(ASTCENC_VECALIGN) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint8 a)
+{
+	alignas(ASTCENC_VECALIGN) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat8 a)
+{
+	alignas(ASTCENC_VECALIGN) float v[8];
+	storea(a, v);
+	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
+	       static_cast<double>(v[0]), static_cast<double>(v[1]),
+	       static_cast<double>(v[2]), static_cast<double>(v[3]),
+	       static_cast<double>(v[4]), static_cast<double>(v[5]),
+	       static_cast<double>(v[6]), static_cast<double>(v[7]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask8 a)
+{
+	print(select(vint8(0), vint8(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

+ 423 - 0
thirdparty/astcenc/astcenc_vecmathlib_common_4.h

@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Generic 4x32-bit vector functions.
+ *
+ * This module implements generic 4-wide vector functions that are valid for
+ * all instruction sets, typically implemented using lower level 4-wide
+ * operations that are ISA-specific.
+ */
+
+#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by scalar addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
+{
+	return a + vint4(b);
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
+{
+	return a - vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
+{
+	return a * vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
+{
+	return a | vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
+{
+	return a & vint4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
+{
+	return a ^ vint4(b);
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ */
+ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
+{
+	return min(max(a, vint4(minv)), vint4(maxv));
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
+{
+	return a.lane<0>() + a.lane<1>() + a.lane<2>();
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by scalar addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
+{
+	return a + vfloat4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
+{
+	return a - vfloat4(b);
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
+{
+	return a * vfloat4(b);
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
+{
+	return vfloat4(a) * b;
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
+{
+	return a / vfloat4(b);
+}
+
+/**
+ * @brief Overload: scalar by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
+{
+	return vfloat4(a) / b;
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
+{
+	return min(a, vfloat4(b));
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
+{
+	return max(a, vfloat4(b));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c minv nor @c maxv is a NaN value. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, minv), maxv);
+}
+
+/**
+ * @brief Return the clamped value between 0.0f and max.
+ *
+ * It is assumed that @c maxv is not a NaN value. If @c a is NaN then zero will
+ * be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, vfloat4::zero()), maxv);
+}
+
+/**
+ * @brief Return the clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return min(max(a, vfloat4::zero()), 1.0f);
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal min of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
+{
+	a.set_lane<3>(a.lane<0>());
+	return hmin_s(a);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
+{
+	return hmax(a).lane<0>();
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
+{
+	accum = accum + a;
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a masked vector.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
+{
+	a = select(vfloat4::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
+{
+	return a.lane<0>() + a.lane<1>() + a.lane<2>();
+}
+
+#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return hadd_s(m);
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return vfloat4(hadd_s(m));
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	return hadd_rgb_s(m);
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
+{
+	vfloat4 m = a * b;
+	float d3 = hadd_rgb_s(m);
+	return vfloat4(d3, d3, d3, 0.0f);
+}
+
+#endif
+
+#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
+
+/**
+ * @brief Population bit count.
+ *
+ * @param v   The value to population count.
+ *
+ * @return The number of 1 bits.
+ */
+static inline int popcount(uint64_t v)
+{
+	uint64_t mask1 = 0x5555555555555555ULL;
+	uint64_t mask2 = 0x3333333333333333ULL;
+	uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
+	v -= (v >> 1) & mask1;
+	v = (v & mask2) + ((v >> 2) & mask2);
+	v += v >> 4;
+	v &= mask3;
+	v *= 0x0101010101010101ULL;
+	v >>= 56;
+	return static_cast<int>(v);
+}
+
+#endif
+
+/**
+ * @brief Apply signed bit transfer.
+ *
+ * @param input0   The first encoded endpoint.
+ * @param input1   The second encoded endpoint.
+ */
+static ASTCENC_SIMD_INLINE void bit_transfer_signed(
+	vint4& input0,
+	vint4& input1
+) {
+	input1 = lsr<1>(input1) | (input0 & 0x80);
+	input0 = lsr<1>(input0) & 0x3F;
+
+	vmask4 mask = (input0 & 0x20) != vint4::zero();
+	input0 = select(input0, input0 - 0x40, mask);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %8d %8d %8d %8d\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %08x %08x %08x %08x\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat4 a)
+{
+	alignas(16) float v[4];
+	storea(a, v);
+	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
+	       static_cast<double>(v[0]), static_cast<double>(v[1]),
+	       static_cast<double>(v[2]), static_cast<double>(v[3]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask4 a)
+{
+	print(select(vint4(0), vint4(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED

+ 1072 - 0
thirdparty/astcenc/astcenc_vecmathlib_neon_4.h

@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
+ *
+ * This module implements 4-wide 32-bit float, int, and mask vectors for
+ * Armv8-A NEON.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ *
+ * The 4-wide vectors are also used as a fixed-width type, and significantly
+ * extend the functionality above that available to VLA code.
+ */
+
+#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
+	{
+		m = vld1q_f32(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m = vdupq_n_f32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		float v[4] { a, b, c, d };
+		m = vld1q_f32(v);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return vgetq_lane_f32(m, l);
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+		m = vsetq_lane_f32(a, m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(vdupq_n_f32(0.0f));
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(vld1q_dup_f32(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(vld1q_f32(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
+		return vfloat4(vld1q_f32(data));
+	}
+
+	/**
+	 * @brief Return a swizzled float 2.
+	 */
+	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 3.
+	 */
+	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 4.
+	 */
+	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
+	}
+
+	/**
+	 * @brief The vector data.
+	 */
+	float32x4_t m;
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
+	{
+		m = vld1q_s32(p);
+	}
+
+	/**
+	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
+	{
+		// Cast is safe - NEON loads are allowed to be unaligned
+		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vfloat4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m = vdupq_n_s32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		int v[4] { a, b, c, d };
+		m = vld1q_s32(v);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return vgetq_lane_s32(m, l);
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
+	{
+		m = vsetq_lane_s32(a, m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 zero()
+	{
+		return vint4(0);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
+	{
+		return vint4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
+	{
+		return vint4(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		alignas(16) static const int data[4] { 0, 1, 2, 3 };
+		return vint4(vld1q_s32(data));
+	}
+
+	/**
+	 * @brief The vector data.
+	 */
+	int32x4_t m;
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
+	{
+		m = a;
+	}
+
+#if !defined(_MSC_VER)
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
+	{
+		m = vreinterpretq_u32_s32(a);
+	}
+#endif
+
+	/**
+	 * @brief Construct from 1 scalar value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
+	{
+		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
+	{
+		int v[4] {
+			a == true ? -1 : 0,
+			b == true ? -1 : 0,
+			c == true ? -1 : 0,
+			d == true ? -1 : 0
+		};
+
+		int32x4_t ms = vld1q_s32(v);
+		m = vreinterpretq_u32_s32(ms);
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
+	{
+		return vgetq_lane_u32(m, l);
+	}
+
+	/**
+	 * @brief The vector data.
+	 */
+	uint32x4_t m;
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(vorrq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(vandq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(veorq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(vmvnq_u32(a.m));
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	static const int shifta[4] { 0, 1, 2, 3 };
+	static const int32x4_t shift = vld1q_s32(shifta);
+
+	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
+	return vaddvq_u32(vshlq_u32(tmp, shift));
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(vaddq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(vsubq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
+{
+	return vint4(vmulq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(vmvnq_s32(a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(vorrq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(vandq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(veorq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return ~vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(vcltq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(vcgtq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
+{
+	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
+{
+	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
+	ua = vshlq_u32(ua, vdupq_n_s32(-s));
+	return vint4(vreinterpretq_s32_u32(ua));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
+{
+	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+	return vint4(vminq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+	return vint4(vmaxq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	return vint4(vminvq_s32(a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
+{
+	return vint4(vmaxvq_s32(a.m));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
+{
+	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
+	return vget_lane_s32(vpadd_s32(t, t), 0);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	vst1q_s32(p, a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
+{
+	vst1q_s32(p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) int vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vint4(vals);
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+{
+	alignas(16) uint8_t shuf[16] {
+		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
+	};
+	uint8x16_t idx = vld1q_u8(shuf);
+	int8x16_t av = vreinterpretq_s8_s32(a.m);
+	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
+{
+	return vint4(vbslq_s32(cond.m, b.m, a.m));
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vaddq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vsubq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vmulq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vdivq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vceqq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcltq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgtq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcleq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgeq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vminnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vmaxnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
+{
+	float32x4_t zero = vdupq_n_f32(0.0f);
+	float32x4_t inv = vsubq_f32(zero, a.m);
+	return vfloat4(vmaxq_f32(a.m, inv));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ */
+ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
+{
+	return vfloat4(vrndnq_f32(a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
+{
+	return vfloat4(vminvq_f32(a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
+{
+	return vfloat4(vmaxvq_f32(a.m));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
+{
+	// Perform halving add to ensure invariance; we cannot use vaddqv as this
+	// does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
+	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
+	return vget_lane_f32(vpadd_f32(t, t), 0);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
+{
+	return vfloat4(vsqrtq_f32(a.m));
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
+	uint32x4_t mask = vcgeq_u32(cond.m, msb);
+	return vfloat4(vbslq_f32(mask, b.m, a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array;
+ */
+ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) float vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vfloat4(vals);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Return a integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	a = round(a);
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
+{
+	return vfloat4(vcvtq_f32_s32(a.m));
+}
+
+/**
+ * @brief Return a float16 value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
+{
+	// Generate float16 value
+	float16x4_t f16 = vcvt_f16_f32(a.m);
+
+	// Convert each 16-bit float pattern to a 32-bit pattern
+	uint16x4_t u16 = vreinterpret_u16_f16(f16);
+	uint32x4_t u32 = vmovl_u16(u16);
+	return vint4(vreinterpretq_s32_u32(u32));
+}
+
+/**
+ * @brief Return a float16 value for a float scalar, using round-to-nearest.
+ */
+static inline uint16_t float_to_float16(float a)
+{
+	vfloat4 av(a);
+	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
+}
+
+/**
+ * @brief Return a float value for a float16 vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
+{
+	// Convert each 32-bit float pattern to a 16-bit pattern
+	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
+	uint16x4_t u16 = vmovn_u32(u32);
+	float16x4_t f16 = vreinterpret_f16_u16(u16);
+
+	// Generate float16 value
+	return vfloat4(vcvt_f32_f16(f16));
+}
+
+/**
+ * @brief Return a float value for a float16 scalar.
+ */
+ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
+{
+	vint4 av(a);
+	return float16_to_float(av).lane<0>();
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	return vint4(vreinterpretq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
+{
+	return vfloat4(vreinterpretq_f32_s32(v.m));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+{
+	t0p = t0;
+}
+
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+{
+	t0p = t0;
+	t1p = t1;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+{
+	t0p = t0;
+	t1p = t1;
+	t2p = t2;
+	t3p = t3;
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+{
+	int8x16_t table {
+		vreinterpretq_s8_s32(t0.m)
+	};
+
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+{
+	int8x16x2_t table {
+		vreinterpretq_s8_s32(t0.m),
+		vreinterpretq_s8_s32(t1.m)
+	};
+
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
+{
+	int8x16x4_t table {
+		vreinterpretq_s8_s32(t0.m),
+		vreinterpretq_s8_s32(t1.m),
+		vreinterpretq_s8_s32(t2.m),
+		vreinterpretq_s8_s32(t3.m)
+	};
+
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
+{
+	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
+{
+	if (mask.lane<3>())
+	{
+		store(data, base);
+	}
+	else if (mask.lane<2>())
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+		base[2] = data.lane<2>();
+	}
+	else if (mask.lane<1>())
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+	}
+	else if (mask.lane<0>())
+	{
+		base[0] = data.lane<0>();
+	}
+}
+
+#define ASTCENC_USE_NATIVE_POPCOUNT 1
+
+/**
+ * @brief Population bit count.
+ *
+ * @param v   The value to population count.
+ *
+ * @return The number of 1 bits.
+ */
+ASTCENC_SIMD_INLINE int popcount(uint64_t v)
+{
+	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED

+ 1169 - 0
thirdparty/astcenc/astcenc_vecmathlib_none_4.h

@@ -0,0 +1,1169 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 4x32-bit vectors, implemented using plain C++.
+ *
+ * This module implements 4-wide 32-bit float, int, and mask vectors. This
+ * module provides a scalar fallback for VLA code, primarily useful for
+ * debugging VLA algorithms without the complexity of handling SIMD. Only the
+ * baseline level of functionality needed to support VLA is provided.
+ *
+ * Note that the vector conditional operators implemented by this module are
+ * designed to behave like SIMD conditional operators that generate lane masks.
+ * Rather than returning 0/1 booleans like normal C++ code they will return
+ * 0/-1 to give a full lane-width bitmask.
+ *
+ * Note that the documentation for this module still talks about "vectors" to
+ * help developers think about the implied VLA behavior when writing optimized
+ * paths.
+ */
+
+#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
+#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <cfenv>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with wider VLA vectors if data is
+	 * aligned to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m[0] = a;
+		m[1] = a;
+		m[2] = a;
+		m[3] = a;
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		m[0] = a;
+		m[1] = b;
+		m[2] = c;
+		m[3] = d;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return m[l];
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+		m[l] = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(0.0f);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 2; unused lanes are zeroed.
+	 */
+	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return  vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 3; the unused lane is zeroed.
+	 */
+	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 4.
+	 */
+	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
+	}
+
+	/**
+	 * @brief The vector lane values.
+	 */
+	float m[4];
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using vint4::loada() which is better with wider VLA vectors
+	 * if data is aligned.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
+	 *
+	 * Each byte is zero-extended into a 32-bit lane.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		m[0] = a;
+		m[1] = b;
+		m[2] = c;
+		m[3] = d;
+	}
+
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vint4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m[0] = a;
+		m[1] = a;
+		m[2] = a;
+		m[3] = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return m[l];
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
+	{
+		m[l] = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 zero()
+	{
+		return vint4(0);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
+	{
+		return vint4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
+	{
+		return vint4(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		return vint4(0, 1, 2, 3);
+	}
+
+	/**
+	 * @brief The vector lane values.
+	 */
+	int m[4];
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from 4 lane mask words loaded from memory.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(int* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
+	{
+		m[0] = a == false ? 0 : -1;
+		m[1] = a == false ? 0 : -1;
+		m[2] = a == false ? 0 : -1;
+		m[3] = a == false ? 0 : -1;
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
+	{
+		m[0] = a == false ? 0 : -1;
+		m[1] = b == false ? 0 : -1;
+		m[2] = c == false ? 0 : -1;
+		m[3] = d == false ? 0 : -1;
+	}
+
+
+	/**
+	 * @brief The vector lane masks; 0 means false, -1 (all bits) means true.
+	 */
+	int m[4];
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] | b.m[0],
+	              a.m[1] | b.m[1],
+	              a.m[2] | b.m[2],
+	              a.m[3] | b.m[3]);
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] & b.m[0],
+	              a.m[1] & b.m[1],
+	              a.m[2] & b.m[2],
+	              a.m[3] & b.m[3]);
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] ^ b.m[0],
+	              a.m[1] ^ b.m[1],
+	              a.m[2] ^ b.m[2],
+	              a.m[3] ^ b.m[3]);
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(~a.m[0],
+	              ~a.m[1],
+	              ~a.m[2],
+	              ~a.m[3]);
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0 ... bit3 = lane 3
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	return ((a.m[0] >> 31) & 0x1) |
+	       ((a.m[1] >> 30) & 0x2) |
+	       ((a.m[2] >> 29) & 0x4) |
+	       ((a.m[3] >> 28) & 0x8);
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] + b.m[0],
+	             a.m[1] + b.m[1],
+	             a.m[2] + b.m[2],
+	             a.m[3] + b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] - b.m[0],
+	             a.m[1] - b.m[1],
+	             a.m[2] - b.m[2],
+	             a.m[3] - b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] * b.m[0],
+	             a.m[1] * b.m[1],
+	             a.m[2] * b.m[2],
+	             a.m[3] * b.m[3]);
+}
+
+/**
+ * @brief Overload: vector bitwise invert (not).
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(~a.m[0],
+	             ~a.m[1],
+	             ~a.m[2],
+	             ~a.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] | b.m[0],
+	             a.m[1] | b.m[1],
+	             a.m[2] | b.m[2],
+	             a.m[3] | b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] & b.m[0],
+	             a.m[1] & b.m[1],
+	             a.m[2] & b.m[2],
+	             a.m[3] & b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] ^ b.m[0],
+	             a.m[1] ^ b.m[1],
+	             a.m[2] ^ b.m[2],
+	             a.m[3] ^ b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ *
+ * Comparisons return a full-width 0/-1 lane mask; see @c vmask4.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] == b.m[0],
+	              a.m[1] == b.m[1],
+	              a.m[2] == b.m[2],
+	              a.m[3] == b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] != b.m[0],
+	              a.m[1] != b.m[1],
+	              a.m[2] != b.m[2],
+	              a.m[3] != b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] < b.m[0],
+	              a.m[1] < b.m[1],
+	              a.m[2] < b.m[2],
+	              a.m[3] < b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] > b.m[0],
+	              a.m[1] > b.m[1],
+	              a.m[2] > b.m[2],
+	              a.m[3] > b.m[3]);
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
+{
+	return vint4(a.m[0] << s,
+	             a.m[1] << s,
+	             a.m[2] << s,
+	             a.m[3] << s);
+}
+
+/**
+ * @brief Logical shift right.
+ *
+ * Shifts via unsigned arithmetic so zeros are shifted in at the top.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
+{
+	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
+	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
+	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
+	unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
+
+	return vint4(static_cast<int>(as0),
+	             static_cast<int>(as1),
+	             static_cast<int>(as2),
+	             static_cast<int>(as3));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
+{
+	return vint4(a.m[0] >> s,
+	             a.m[1] >> s,
+	             a.m[2] >> s,
+	             a.m[3] >> s);
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
+	             a.m[1] < b.m[1] ? a.m[1] : b.m[1],
+	             a.m[2] < b.m[2] ? a.m[2] : b.m[2],
+	             a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
+	             a.m[1] > b.m[1] ? a.m[1] : b.m[1],
+	             a.m[2] > b.m[2] ? a.m[2] : b.m[2],
+	             a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the horizontal minimum of a single vector,
+ * replicated across all lanes.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	int b = std::min(a.m[0], a.m[1]);
+	int c = std::min(a.m[2], a.m[3]);
+	return vint4(std::min(b, c));
+}
+
+/**
+ * @brief Return the horizontal maximum of a single vector,
+ * replicated across all lanes.
+ */
+ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
+{
+	int b = std::max(a.m[0], a.m[1]);
+	int c = std::max(a.m[2], a.m[3]);
+	return vint4(std::max(b, c));
+}
+
+/**
+ * @brief Return the horizontal sum of vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
+{
+	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
+}
+
+/**
+ * @brief Store a vector to an aligned memory address.
+ *
+ * The scalar implementation has no actual alignment requirement.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	p[0] = a.m[0];
+	p[1] = a.m[1];
+	p[2] = a.m[2];
+	p[3] = a.m[3];
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
+{
+	p[0] = a.m[0];
+	p[1] = a.m[1];
+	p[2] = a.m[2];
+	p[3] = a.m[3];
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	// NOTE(review): type-punned, potentially unaligned int store via
+	// reinterpret_cast; assumes the platform tolerates this — confirm
+	int* pi = reinterpret_cast<int*>(p);
+	*pi = a.m[0];
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+	return vint4(base[indices.m[0]],
+	             base[indices.m[1]],
+	             base[indices.m[2]],
+	             base[indices.m[3]]);
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ *
+ * Lane 0 byte lands in the LSB of the result's lane 0.
+ */
+ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+{
+	int b0 = a.m[0] & 0xFF;
+	int b1 = a.m[1] & 0xFF;
+	int b2 = a.m[2] & 0xFF;
+	int b3 = a.m[3] & 0xFF;
+
+	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+	return vint4(b, 0, 0, 0);
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
+{
+	return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
+	             (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
+	             (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
+	             (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] + b.m[0],
+	               a.m[1] + b.m[1],
+	               a.m[2] + b.m[2],
+	               a.m[3] + b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] - b.m[0],
+	               a.m[1] - b.m[1],
+	               a.m[2] - b.m[2],
+	               a.m[3] - b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] * b.m[0],
+	               a.m[1] * b.m[1],
+	               a.m[2] * b.m[2],
+	               a.m[3] * b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] / b.m[0],
+	               a.m[1] / b.m[1],
+	               a.m[2] / b.m[2],
+	               a.m[3] / b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ *
+ * Comparisons return a full-width 0/-1 lane mask; see @c vmask4.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] == b.m[0],
+	              a.m[1] == b.m[1],
+	              a.m[2] == b.m[2],
+	              a.m[3] == b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] != b.m[0],
+	              a.m[1] != b.m[1],
+	              a.m[2] != b.m[2],
+	              a.m[3] != b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] < b.m[0],
+	              a.m[1] < b.m[1],
+	              a.m[2] < b.m[2],
+	              a.m[3] < b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] > b.m[0],
+	              a.m[1] > b.m[1],
+	              a.m[2] > b.m[2],
+	              a.m[3] > b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] <= b.m[0],
+	              a.m[1] <= b.m[1],
+	              a.m[2] <= b.m[2],
+	              a.m[3] <= b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(a.m[0] >= b.m[0],
+	              a.m[1] >= b.m[1],
+	              a.m[2] >= b.m[2],
+	              a.m[3] >= b.m[3]);
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
+	               a.m[1] < b.m[1] ? a.m[1] : b.m[1],
+	               a.m[2] < b.m[2] ? a.m[2] : b.m[2],
+	               a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
+	               a.m[1] > b.m[1] ? a.m[1] : b.m[1],
+	               a.m[2] > b.m[2] ? a.m[2] : b.m[2],
+	               a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
+{
+	return vfloat4(std::abs(a.m[0]),
+	               std::abs(a.m[1]),
+	               std::abs(a.m[2]),
+	               std::abs(a.m[3]));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ *
+ * Uses the current FP rounding mode, which must be round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
+{
+	assert(std::fegetround() == FE_TONEAREST);
+	return vfloat4(std::nearbyint(a.m[0]),
+	               std::nearbyint(a.m[1]),
+	               std::nearbyint(a.m[2]),
+	               std::nearbyint(a.m[3]));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, replicated across lanes.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
+{
+	float tmp1 = std::min(a.m[0], a.m[1]);
+	float tmp2 = std::min(a.m[2], a.m[3]);
+	return vfloat4(std::min(tmp1, tmp2));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, replicated across lanes.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
+{
+	float tmp1 = std::max(a.m[0], a.m[1]);
+	float tmp2 = std::max(a.m[2], a.m[3]);
+	return vfloat4(std::max(tmp1, tmp2));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
+{
+	// Use halving add, gives invariance with SIMD versions:
+	// the (0+2)+(1+3) association matches the vectorized reductions
+	return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
+{
+	return vfloat4(std::sqrt(a.m[0]),
+	               std::sqrt(a.m[1]),
+	               std::sqrt(a.m[2]),
+	               std::sqrt(a.m[3]));
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ *
+ * In this scalar implementation only the MSB of each mask word is tested,
+ * so select() and select_msb() behave identically.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
+	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
+	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
+	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
+	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
+	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
+	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
+}
+
+/**
+ * @brief Load a vector of gathered results from an array.
+ */
+ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
+{
+	return vfloat4(base[indices.m[0]],
+	               base[indices.m[1]],
+	               base[indices.m[2]],
+	               base[indices.m[3]]);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
+{
+	ptr[0] = a.m[0];
+	ptr[1] = a.m[1];
+	ptr[2] = a.m[2];
+	ptr[3] = a.m[3];
+}
+
+/**
+ * @brief Store a vector to an aligned memory address.
+ *
+ * The scalar implementation has no actual alignment requirement.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
+{
+	ptr[0] = a.m[0];
+	ptr[1] = a.m[1];
+	ptr[2] = a.m[2];
+	ptr[3] = a.m[3];
+}
+
+/**
+ * @brief Return a integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(static_cast<int>(a.m[0]),
+	             static_cast<int>(a.m[1]),
+	             static_cast<int>(a.m[2]),
+	             static_cast<int>(a.m[3]));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	// NOTE(review): add-0.5-then-truncate implements round-to-nearest only
+	// for non-negative inputs — presumably callers never pass negatives
+	return vint4(static_cast<int>(a.m[0] + 0.5f),
+	             static_cast<int>(a.m[1] + 0.5f),
+	             static_cast<int>(a.m[2] + 0.5f),
+	             static_cast<int>(a.m[3] + 0.5f));
+}
+
+/**
+ * @brief Return a float value for a integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
+{
+	return vfloat4(static_cast<float>(a.m[0]),
+	               static_cast<float>(a.m[1]),
+	               static_cast<float>(a.m[2]),
+	               static_cast<float>(a.m[3]));
+}
+
+/**
+ * @brief Return a float16 value for a float vector, using round-to-nearest.
+ *
+ * Delegates to the soft-float helpers; results are stored in the low 16
+ * bits of each lane.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
+{
+	return vint4(
+		float_to_sf16(a.lane<0>()),
+		float_to_sf16(a.lane<1>()),
+		float_to_sf16(a.lane<2>()),
+		float_to_sf16(a.lane<3>()));
+}
+
+/**
+ * @brief Return a float16 value for a float scalar, using round-to-nearest.
+ */
+static inline uint16_t float_to_float16(float a)
+{
+	return float_to_sf16(a);
+}
+
+/**
+ * @brief Return a float value for a float16 vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
+{
+	return vfloat4(
+		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
+}
+
+/**
+ * @brief Return a float value for a float16 scalar.
+ */
+ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
+{
+	return sf16_to_float(a);
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	vint4 r;
+	// Copy all 4 lanes x 4 bytes; memcpy avoids aliasing violations
+	memcpy(r.m, a.m, 4 * 4);
+	return r;
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
+{
+	vfloat4 r;
+	// Copy all 4 lanes x 4 bytes; memcpy avoids aliasing violations
+	memcpy(r.m, a.m, 4 * 4);
+	return r;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ *
+ * The scalar implementation needs no reformatting; this is a plain copy.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+{
+	t0p = t0;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+{
+	t0p = t0;
+	t1p = t1;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+{
+	t0p = t0;
+	t1p = t1;
+	t2p = t2;
+	t3p = t3;
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+{
+	uint8_t table[16];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+}
+
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+{
+	uint8_t table[32];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+	storea(t1, reinterpret_cast<int*>(table + 16));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
+{
+	uint8_t table[64];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+	storea(t1, reinterpret_cast<int*>(table + 16));
+	storea(t2, reinterpret_cast<int*>(table + 32));
+	storea(t3, reinterpret_cast<int*>(table + 48));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
+{
+	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ * The highest enabled lane therefore determines how many lanes are written.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
+{
+	if (mask.m[3])
+	{
+		store(data, base);
+	}
+	else if (mask.m[2])
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+		base[2] = data.lane<2>();
+	}
+	else if (mask.m[1])
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+	}
+	else if (mask.m[0])
+	{
+		base[0] = data.lane<0>();
+	}
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED

+ 1283 - 0
thirdparty/astcenc/astcenc_vecmathlib_sse_4.h

@@ -0,0 +1,1283 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 4x32-bit vectors, implemented using SSE.
+ *
+ * This module implements 4-wide 32-bit float, int, and mask vectors for x86
+ * SSE. The implementation requires at least SSE2, but higher levels of SSE can
+ * be selected at compile time to improve performance.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ *
+ * The 4-wide vectors are also used as a fixed-width type, and significantly
+ * extend the functionality above that available to VLA code.
+ */
+
+#ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED
+#define ASTC_VECMATHLIB_SSE_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
+	{
+		m = _mm_loadu_ps(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m = _mm_set1_ps(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		m = _mm_set_ps(d, c, b, a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(__m128 a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+#if ASTCENC_SSE >= 41
+		__m128 v = _mm_set1_ps(a);
+		m = _mm_insert_ps(m, v, l << 6 | l << 4);
+#else
+		alignas(16) float idx[4];
+		_mm_store_ps(idx, m);
+		idx[l] = a;
+		m = _mm_load_ps(idx);
+#endif
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(_mm_setzero_ps());
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(_mm_load_ps1(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(_mm_load_ps(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		return vfloat4(_mm_set_ps(3, 2, 1, 0));
+	}
+
+	/**
+	 * @brief Return a swizzled float 2.
+	 */
+	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2));
+		result.set_lane<2>(0.0f);
+		result.set_lane<3>(0.0f);
+		return result;
+	}
+
+	/**
+	 * @brief Return a swizzled float 3.
+	 */
+	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4));
+		result.set_lane<3>(0.0f);
+		return result;
+	}
+
+	/**
+	 * @brief Return a swizzled float 4.
+	 */
+	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4 | l3 << 6));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	__m128 m;
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
+	{
+		m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+	}
+
+	/**
+	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
+	{
+		// _mm_loadu_si32 would be nicer syntax, but missing on older GCC
+		__m128i t = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(p));
+
+#if ASTCENC_SSE >= 41
+		m = _mm_cvtepu8_epi32(t);
+#else
+		t = _mm_unpacklo_epi8(t, _mm_setzero_si128());
+		m = _mm_unpacklo_epi16(t, _mm_setzero_si128());
+#endif
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vfloat4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m = _mm_set1_epi32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		m = _mm_set_epi32(d, c, b, a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(__m128i a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return _mm_cvtsi128_si32(_mm_shuffle_epi32(m, l));
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
+	{
+#if ASTCENC_SSE >= 41
+		m = _mm_insert_epi32(m, a, l);
+#else
+		alignas(16) int idx[4];
+		_mm_store_si128(reinterpret_cast<__m128i*>(idx), m);
+		idx[l] = a;
+		m = _mm_load_si128(reinterpret_cast<const __m128i*>(idx));
+#endif
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 zero()
+	{
+		return vint4(_mm_setzero_si128());
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
+	{
+		return vint4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
+	{
+		return vint4(_mm_load_si128(reinterpret_cast<const __m128i*>(p)));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		return vint4(_mm_set_epi32(3, 2, 1, 0));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	__m128i m;
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(__m128 a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(__m128i a)
+	{
+		m = _mm_castsi128_ps(a);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
+	{
+		vint4 mask(a == false ? 0 : -1);
+		m = _mm_castsi128_ps(mask.m);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
+	{
+		vint4 mask(a == false ? 0 : -1,
+		           b == false ? 0 : -1,
+		           c == false ? 0 : -1,
+		           d == false ? 0 : -1);
+
+		m = _mm_castsi128_ps(mask.m);
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	__m128 m;
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(_mm_or_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(_mm_and_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(_mm_xor_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(_mm_xor_si128(_mm_castps_si128(a.m), _mm_set1_epi32(-1)));
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(_mm_add_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(_mm_sub_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
+{
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_mullo_epi32 (a.m, b.m));
+#else
+	__m128i t1 = _mm_mul_epu32(a.m, b.m);
+	__m128i t2 = _mm_mul_epu32(
+	                 _mm_srli_si128(a.m, 4),
+	                 _mm_srli_si128(b.m, 4));
+	__m128i r =  _mm_unpacklo_epi32(
+	                 _mm_shuffle_epi32(t1, _MM_SHUFFLE (0, 0, 2, 0)),
+	                 _mm_shuffle_epi32(t2, _MM_SHUFFLE (0, 0, 2, 0)));
+	return vint4(r);
+#endif
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(_mm_xor_si128(a.m, _mm_set1_epi32(-1)));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(_mm_or_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(_mm_and_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(_mm_xor_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return ~vmask4(_mm_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmplt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmpgt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
+{
+	return vint4(_mm_slli_epi32(a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
+{
+	return vint4(_mm_srli_epi32(a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
+{
+	return vint4(_mm_srai_epi32(a.m, s));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_min_epi32(a.m, b.m));
+#else
+	vmask4 d = a < b;
+	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
+	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
+	return vint4(_mm_or_si128(ap,bp));
+#endif
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_max_epi32(a.m, b.m));
+#else
+	vmask4 d = a > b;
+	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
+	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
+	return vint4(_mm_or_si128(ap,bp));
+#endif
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ *
+ * @return A vector with the minimum broadcast to every lane.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	// Fold the high pair onto the low pair, then lane 1 onto lane 0 ...
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	// ... then broadcast lane 0 to all lanes.
+	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ *
+ * @return A vector with the maximum broadcast to every lane.
+ */
+ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
+{
+	// Fold the high pair onto the low pair, then lane 1 onto lane 0 ...
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	// ... then broadcast lane 0 to all lanes.
+	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
+{
+	// Add top and bottom halves, lane 1/0
+	__m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m),
+	                                              _mm_castsi128_ps(a.m)));
+	__m128i t = _mm_add_epi32(a.m, fold);
+
+	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
+	t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55));
+
+	return _mm_cvtsi128_si32(t);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	_mm_store_si128(reinterpret_cast<__m128i*>(p), a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
+{
+	// Cast due to missing intrinsics
+	_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	// Cast due to missing intrinsics
+	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+#if ASTCENC_AVX >= 2
+	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
+#else
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
+#endif
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+{
+#if ASTCENC_SSE >= 41
+	__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
+	return vint4(_mm_shuffle_epi8(a.m, shuf));
+#else
+	__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
+	__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
+	return vint4(_mm_unpacklo_epi16(va, vb));
+#endif
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
+{
+	__m128i condi = _mm_castps_si128(cond.m);
+
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_blendv_epi8(a.m, b.m, condi));
+#else
+	return vint4(_mm_or_si128(_mm_and_si128(condi, b.m), _mm_andnot_si128(condi, a.m)));
+#endif
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_add_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_sub_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_mul_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_div_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmpeq_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmpneq_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmplt_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmpgt_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmple_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(_mm_cmpge_ps(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(_mm_min_ps(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(_mm_max_ps(a.m, b.m));
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
+{
+	return vfloat4(_mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a.m), a.m));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ */
+ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
+{
+#if ASTCENC_SSE >= 41
+	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+	return vfloat4(_mm_round_ps(a.m, flags));
+#else
+	__m128 v = a.m;
+	__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
+	__m128 no_fraction = _mm_set1_ps(8388608.0f);
+	__m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+	__m128 sign = _mm_and_ps(v, neg_zero);
+	__m128 s_magic = _mm_or_ps(no_fraction, sign);
+	__m128 r1 = _mm_add_ps(v, s_magic);
+	r1 = _mm_sub_ps(r1, s_magic);
+	__m128 r2 = _mm_and_ps(v, abs_mask);
+	__m128 mask = _mm_cmple_ps(r2, no_fraction);
+	r2 = _mm_andnot_ps(mask, v);
+	r1 = _mm_and_ps(r1, mask);
+	return vfloat4(_mm_xor_ps(r1, r2));
+#endif
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
+{
+	a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
+{
+	a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
+{
+	// Add top and bottom halves, lane 1/0
+	__m128 t = _mm_add_ps(a.m, _mm_movehl_ps(a.m, a.m));
+
+	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
+	t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 0x55));
+
+	return _mm_cvtss_f32(t);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
+{
+	return vfloat4(_mm_sqrt_ps(a.m));
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+#if ASTCENC_SSE >= 41
+	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
+#else
+	return vfloat4(_mm_or_ps(_mm_and_ps(cond.m, b.m), _mm_andnot_ps(cond.m, a.m)));
+#endif
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+#if ASTCENC_SSE >= 41
+	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
+#else
+	__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
+	return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
+#endif
+}
+
+/**
+ * @brief Load a vector of gathered results from an array.
+ */
+ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
+{
+#if ASTCENC_AVX >= 2
+	// Hardware gather with a 4-byte element scale.
+	return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
+#else
+	// Scalar fallback: spill the indices and load each lane individually.
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
+#endif
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
+{
+	_mm_storeu_ps(p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
+{
+	_mm_store_ps(p, a.m);
+}
+
+/**
+ * @brief Return an integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(_mm_cvttps_epi32(a.m));
+}
+
+/**
+ * @brief Return an integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	// Round in the float domain first so the truncating conversion below
+	// yields round-to-nearest overall.
+	a = round(a);
+	return vint4(_mm_cvttps_epi32(a.m));
+}
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
+{
+	return vfloat4(_mm_cvtepi32_ps(a.m));
+}
+
+/**
+ * @brief Return a float16 value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
+{
+#if ASTCENC_F16C >= 1
+	__m128i packedf16 = _mm_cvtps_ph(a.m, 0);
+	__m128i f16 = _mm_cvtepu16_epi32(packedf16);
+	return vint4(f16);
+#else
+	return vint4(
+		float_to_sf16(a.lane<0>()),
+		float_to_sf16(a.lane<1>()),
+		float_to_sf16(a.lane<2>()),
+		float_to_sf16(a.lane<3>()));
+#endif
+}
+
+/**
+ * @brief Return a float16 value for a float scalar, using round-to-nearest.
+ */
+static inline uint16_t float_to_float16(float a)
+{
+#if ASTCENC_F16C >= 1
+	__m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0);
+	return  static_cast<uint16_t>(_mm_cvtsi128_si32(f16));
+#else
+	return float_to_sf16(a);
+#endif
+}
+
+/**
+ * @brief Return a float value for a float16 vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
+{
+#if ASTCENC_F16C >= 1
+	__m128i packed = _mm_packs_epi32(a.m, a.m);
+	__m128 f32 = _mm_cvtph_ps(packed);
+	return vfloat4(f32);
+#else
+	return vfloat4(
+		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
+#endif
+}
+
+/**
+ * @brief Return a float value for a float16 scalar.
+ */
+ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
+{
+#if ASTCENC_F16C >= 1
+	__m128i packed = _mm_set1_epi16(static_cast<short>(a));
+	__m128 f32 = _mm_cvtph_ps(packed);
+	return _mm_cvtss_f32(f32);
+#else
+	return sf16_to_float(a);
+#endif
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	return vint4(_mm_castps_si128(a.m));
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
+{
+	return vfloat4(_mm_castsi128_ps(v.m));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+{
+	t0p = t0;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+{
+#if ASTCENC_SSE >= 30
+	t0p = t0;
+	t1p = t0 ^ t1;
+#else
+	t0p = t0;
+	t1p = t1;
+#endif
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+{
+#if ASTCENC_SSE >= 30
+	t0p = t0;
+	t1p = t0 ^ t1;
+	t2p = t1 ^ t2;
+	t3p = t2 ^ t3;
+#else
+	t0p = t0;
+	t1p = t1;
+	t2p = t2;
+	t3p = t3;
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+{
+#if ASTCENC_SSE >= 30
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	return vint4(result);
+#else
+	alignas(ASTCENC_VECALIGN) uint8_t table[16];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+{
+#if ASTCENC_SSE >= 30
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	alignas(ASTCENC_VECALIGN) uint8_t table[32];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+	storea(t1, reinterpret_cast<int*>(table + 16));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
+{
+#if ASTCENC_SSE >= 30
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(t2.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(t3.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	alignas(ASTCENC_VECALIGN) uint8_t table[64];
+	storea(t0, reinterpret_cast<int*>(table +  0));
+	storea(t1, reinterpret_cast<int*>(table + 16));
+	storea(t2, reinterpret_cast<int*>(table + 32));
+	storea(t3, reinterpret_cast<int*>(table + 48));
+
+	return vint4(table[idx.lane<0>()],
+	             table[idx.lane<1>()],
+	             table[idx.lane<2>()],
+	             table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
+{
+// Workaround an XCode compiler internal fault; note is slower than slli_epi32
+// so we should revert this when we get the opportunity
+#if defined(__APPLE__)
+	// The whole-register byte shifts are safe here because only bytes
+	// 0/4/8/12 are non-zero on input, so no non-zero byte ever crosses a
+	// 32-bit lane boundary when shifted by at most 3 bytes.
+	__m128i value = r.m;
+	value = _mm_add_epi32(value, _mm_bslli_si128(g.m, 1));
+	value = _mm_add_epi32(value, _mm_bslli_si128(b.m, 2));
+	value = _mm_add_epi32(value, _mm_bslli_si128(a.m, 3));
+	return vint4(value);
+#else
+	// Pack one texel per lane: R in bits 0-7, G in 8-15, B in 16-23, A in 24-31.
+	__m128i value = r.m;
+	value = _mm_add_epi32(value, _mm_slli_epi32(g.m,  8));
+	value = _mm_add_epi32(value, _mm_slli_epi32(b.m, 16));
+	value = _mm_add_epi32(value, _mm_slli_epi32(a.m, 24));
+	return vint4(value);
+#endif
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
+{
+#if ASTCENC_AVX >= 2
+	_mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m);
+#else
+	// Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
+	// fault suppression on masked lanes so we can get page faults at the end of an image.
+	if (mask.lane<3>() != 0.0f)
+	{
+		store(data, base);
+	}
+	else if (mask.lane<2>() != 0.0f)
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+		base[2] = data.lane<2>();
+	}
+	else if (mask.lane<1>() != 0.0f)
+	{
+		base[0] = data.lane<0>();
+		base[1] = data.lane<1>();
+	}
+	else if (mask.lane<0>() != 0.0f)
+	{
+		base[0] = data.lane<0>();
+	}
+#endif
+}
+
+#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)
+
+#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
+{
+	return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0xFF));
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_dp_ps(a.m, b.m, 0xFF));
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
+{
+	return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0x77));
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(_mm_dp_ps(a.m, b.m, 0x77));
+}
+
+#endif // #if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)
+
+#if ASTCENC_POPCNT >= 1
+
+#define ASTCENC_USE_NATIVE_POPCOUNT 1
+
+/**
+ * @brief Population bit count.
+ *
+ * @param v   The value to population count.
+ *
+ * @return The number of 1 bits.
+ */
+ASTCENC_SIMD_INLINE int popcount(uint64_t v)
+{
+	// NOTE(review): _mm_popcnt_u64 is only available when targeting x86-64;
+	// confirm that 32-bit x86 builds configure ASTCENC_POPCNT to 0.
+	return static_cast<int>(_mm_popcnt_u64(v));
+}
+
+#endif // ASTCENC_POPCNT >= 1
+
+#endif // #ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED

+ 479 - 0
thirdparty/astcenc/astcenc_weight_align.cpp

@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for angular-sum algorithm for weight alignment.
+ *
+ * This algorithm works as follows:
+ * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
+ *   where i is the input value and s is a scaling factor based on the spacing between the weights.
+ * - we then add together complex numbers for all the weights.
+ * - we then compute the length and angle of the resulting sum.
+ *
+ * This should produce the following results:
+ * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
+ * - even distribution results in a vector of length 0.
+ * - all samples identical results in perfect alignment for every scaling.
+ *
+ * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
+ * should then result in some scalings standing out as having particularly good alignment factors;
+ * we can use this to produce a set of candidate scale/shift values for various quantization levels;
+ * we should then actually try them and see what happens.
+ */
+
+#include "astcenc_internal.h"
+#include "astcenc_vecmathlib.h"
+
+#include <stdio.h>
+#include <cassert>
+#include <cstring>
+
+// Number of angular stepping rates tested; the vectorized loops below rely
+// on the two static assertions that follow.
+static constexpr unsigned int ANGULAR_STEPS { 32 };
+
+static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
+              "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
+
+static_assert(ANGULAR_STEPS >= 32,
+              "ANGULAR_STEPS must be at least max(steps_for_quant_level)");
+
+// Store a reduced sin/cos table for 64 possible weight values; this causes
+// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
+static constexpr unsigned int SINCOS_STEPS { 64 };
+
+// Number of weight steps available for each weight quantization level.
+static const uint8_t steps_for_quant_level[12] {
+	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
+};
+
+// Tables indexed as [input weight sample][angular step]; populated once by
+// prepare_angular_tables().
+alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
+alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	// Ensures the "unable to find encoding" diagnostic is printed only once.
+	static bool print_once { true };
+#endif
+
+/* See header for documentation. */
+void prepare_angular_tables()
+{
+	for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
+	{
+		// Angular step i corresponds to a stepping rate of (i + 1) steps
+		// across the weight range.
+		float angle_step = static_cast<float>(i + 1);
+
+		for (unsigned int j = 0; j < SINCOS_STEPS; j++)
+		{
+			sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
+			cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
+		}
+	}
+}
+
+/**
+ * @brief Compute the angular alignment factors and offsets.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param[out] offsets                   The output angular offsets array.
+ */
+static void compute_angular_offsets(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	float* offsets
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
+
+	// Precompute isample; arrays are always allocated 64 elements long
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Add 1.5 * 2^23 (12582912.0f) and reinterpret the float bits to
+		// extract a round-to-nearest integer; the mask relies on
+		// SINCOS_STEPS being a power of two.
+		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
+		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		storea(isample, isamplev + i);
+	}
+
+	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
+	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+
+	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Accumulate the complex sum (cos, sin) over all weights for each
+		// candidate angular step handled by this vector.
+		vfloat anglesum_x = vfloat::zero();
+		vfloat anglesum_y = vfloat::zero();
+
+		for (unsigned int j = 0; j < weight_count; j++)
+		{
+			int isample = isamplev[j];
+			anglesum_x += loada(cos_table[isample] + i);
+			anglesum_y += loada(sin_table[isample] + i);
+		}
+
+		// Store offsets as a fraction of a full revolution (angle / 2*pi)
+		vfloat angle = atan2(anglesum_y, anglesum_x);
+		vfloat ofs = angle * mult;
+		storea(ofs, offsets + i);
+	}
+}
+
+/**
+ * @brief For a given step size compute the lowest and highest weight.
+ *
+ * Compute the lowest and highest weight that results from quantizing using the given stepsize and
+ * offset, and then compute the resulting error. The cut errors indicate the error that results from
+ * forcing samples that should have had one weight value one step up or down.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param      max_quant_steps           The maximum quantization level to be tested.
+ * @param      offsets                   The angular offsets array.
+ * @param[out] lowest_weight             Per angular step, the lowest weight.
+ * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
+ * @param[out] error                     Per angular step, the error.
+ * @param[out] cut_low_weight_error      Per angular step, the low weight cut error.
+ * @param[out] cut_high_weight_error     Per angular step, the high weight cut error.
+ */
+static void compute_lowest_and_highest_weight(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	unsigned int max_quant_steps,
+	const float* offsets,
+	float* lowest_weight,
+	int* weight_span,
+	float* error,
+	float* cut_low_weight_error,
+	float* cut_high_weight_error
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	// Each SIMD lane tests a different step rate: lane n handles step (sp + n + 1)
+	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+
+	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
+	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat minidx(128.0f);
+		vfloat maxidx(-128.0f);
+		vfloat errval = vfloat::zero();
+		vfloat cut_low_weight_err = vfloat::zero();
+		vfloat cut_high_weight_err = vfloat::zero();
+		vfloat offset = loada(offsets + sp);
+
+		for (unsigned int j = 0; j < weight_count; j++)
+		{
+			// Quantize the weight at this step rate; diff is the signed
+			// rounding error in step units
+			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
+			vfloat svalrte = round(sval);
+			vfloat diff = sval - svalrte;
+			errval += diff * diff;
+
+			// Reset tracker on min hit
+			vmask mask = svalrte < minidx;
+			minidx = select(minidx, svalrte, mask);
+			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
+
+			// Accumulate on min hit
+			mask = svalrte == minidx;
+			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
+			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
+
+			// Reset tracker on max hit
+			mask = svalrte > maxidx;
+			maxidx = select(maxidx, svalrte, mask);
+			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
+
+			// Accumulate on max hit
+			mask = svalrte == maxidx;
+			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
+			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
+		}
+
+		// Write out min weight and weight span; clamp span to a usable range
+		vint span = float_to_int(maxidx - minidx + vfloat(1));
+		span = min(span, vint(max_quant_steps + 3));
+		span = max(span, vint(2));
+		storea(minidx, lowest_weight + sp);
+		storea(span, weight_span + sp);
+
+		// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
+		// samples that should have had the weight value one step (up/down).
+		// Errors were accumulated in step units; rescale the squared error
+		// back to the weight domain.
+		vfloat ssize = 1.0f / rcp_stepsize;
+		vfloat errscale = ssize * ssize;
+		storea(errval * errscale, error + sp);
+		storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
+		storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
+
+		rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
+	}
+}
+
+/**
+ * @brief The main function for the angular algorithm.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_quant_level           The maximum quantization level to be tested.
+ * @param[out] low_value                 Per angular step, the lowest weight value.
+ * @param[out] high_value                Per angular step, the highest weight value.
+ */
+static void compute_angular_endpoints_for_quant_levels(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_quant_level,
+	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
+	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
+) {
+	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
+	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
+
+	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
+
+	compute_angular_offsets(weight_count, dec_weight_ideal_value,
+	                        max_angular_steps, angular_offsets);
+
+	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
+	alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
+
+	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
+	                                  max_angular_steps, max_quant_steps,
+	                                  angular_offsets, lowest_weight, weight_span, error,
+	                                  cut_low_weight_error, cut_high_weight_error);
+
+	// For each quantization level, find the best error terms. Use packed vectors so data-dependent
+	// branches can become selects. This involves some integer to float casts, but the values are
+	// small enough so they never round the wrong way.
+	// Sized for the worst case: max(steps_for_quant_level) is 32 and spans are
+	// clamped to max_quant_steps + 3, so indices never exceed 35.
+	vfloat4 best_results[36];
+
+	// Initialize the array to some safe defaults
+	promise(max_quant_steps > 0);
+	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
+	{
+		// Lane<0> = Best error
+		// Lane<1> = Best scale; -1 indicates no solution found
+		// Lane<2> = Cut low weight
+		best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
+	}
+
+	promise(max_angular_steps > 0);
+	for (unsigned int i = 0; i < max_angular_steps; i++)
+	{
+		float i_flt = static_cast<float>(i);
+
+		int idx_span = weight_span[i];
+
+		float error_cut_low = error[i] + cut_low_weight_error[i];
+		float error_cut_high = error[i] + cut_high_weight_error[i];
+		float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
+
+		// Check best error against record N
+		vfloat4 best_result = best_results[idx_span];
+		vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
+		vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
+		best_results[idx_span] = select(best_result, new_result, mask);
+
+		// Check best error against record N-1 with either cut low or cut high
+		best_result = best_results[idx_span - 1];
+
+		new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
+		best_result = select(best_result, new_result, mask);
+
+		new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
+		best_results[idx_span - 1] = select(best_result, new_result, mask);
+
+		// Check best error against record N-2 with both cut low and high
+		best_result = best_results[idx_span - 2];
+		new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
+		best_results[idx_span - 2] = select(best_result, new_result, mask);
+	}
+
+	// Convert the best result per step count into low/high endpoints per
+	// quantization level
+	for (unsigned int i = 0; i <= max_quant_level; i++)
+	{
+		unsigned int q = steps_for_quant_level[i];
+		int bsi = static_cast<int>(best_results[q].lane<1>());
+
+		// Did we find anything?
+#if defined(ASTCENC_DIAGNOSTICS)
+		if ((bsi < 0) && print_once)
+		{
+			print_once = false;
+			printf("INFO: Unable to find full encoding within search error limit.\n\n");
+		}
+#endif
+
+		bsi = astc::max(0, bsi);
+
+		// Lane<2> carries the cut-low adjustment applied to the lowest weight
+		float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
+		float hwi = lwi + static_cast<float>(q) - 1.0f;
+
+		float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
+		low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
+		high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_1plane(
+	bool only_always,
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+
+	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
+	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
+
+	// Phase 1: compute angular endpoint candidates for each active
+	// single-plane decimation mode
+	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
+	                                                : bsd.decimation_mode_count_selected;
+	promise(max_decimation_modes > 0);
+	for (unsigned int i = 0; i < max_decimation_modes; i++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[i];
+		if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
+
+		// Clamp tested precision to both the tuning limit and the caller limit
+		unsigned int max_precision = dm.maxprec_1plane;
+		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
+		{
+			max_precision = TUNE_MAX_ANGULAR_QUANT;
+		}
+
+		if (max_precision > max_weight_quant)
+		{
+			max_precision = max_weight_quant;
+		}
+
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values[i], high_values[i]);
+	}
+
+	// Phase 2: fan the per-decimation-mode results out to each block mode
+	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
+	                                           : bsd.block_mode_count_1plane_selected;
+	promise(max_block_modes > 0);
+	for (unsigned int i = 0; i < max_block_modes; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+		assert(!bm.is_dual_plane);
+
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
+		{
+			low_value[i] = low_values[decim_mode][quant_mode];
+			high_value[i] = high_values[decim_mode][quant_mode];
+		}
+		else
+		{
+			// Quant level above the angular search limit; use the full range
+			low_value[i] = 0.0f;
+			high_value[i] = 1.0f;
+		}
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_2planes(
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
+	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
+
+	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
+	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
+	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
+	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
+
+	// Phase 1: compute angular endpoint candidates for each active
+	// dual-plane decimation mode, once per plane
+	promise(bsd.decimation_mode_count_selected > 0);
+	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[i];
+		if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			continue;
+		}
+
+		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
+
+		// Clamp tested precision to both the tuning limit and the caller limit
+		unsigned int max_precision = dm.maxprec_2planes;
+		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
+		{
+			max_precision = TUNE_MAX_ANGULAR_QUANT;
+		}
+
+		if (max_precision > max_weight_quant)
+		{
+			max_precision = max_weight_quant;
+		}
+
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values1[i], high_values1[i]);
+
+		// Plane 2 weights are stored at WEIGHTS_PLANE2_OFFSET within each
+		// mode's weight slice
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
+		    max_precision, low_values2[i], high_values2[i]);
+	}
+
+	// Phase 2: fan out to block modes; dual-plane block modes are stored
+	// after the single-plane ones in bsd.block_modes
+	unsigned int start = bsd.block_mode_count_1plane_selected;
+	unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
+	for (unsigned int i = start; i < end; i++)
+	{
+		const block_mode& bm = bsd.block_modes[i];
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
+		{
+			low_value1[i] = low_values1[decim_mode][quant_mode];
+			high_value1[i] = high_values1[decim_mode][quant_mode];
+			low_value2[i] = low_values2[decim_mode][quant_mode];
+			high_value2[i] = high_values2[decim_mode][quant_mode];
+		}
+		else
+		{
+			// Quant level above the angular search limit; use the full range
+			low_value1[i] = 0.0f;
+			high_value1[i] = 1.0f;
+			low_value2[i] = 0.0f;
+			high_value2[i] = 1.0f;
+		}
+	}
+}
+
+#endif

+ 147 - 0
thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp

@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Data tables for quantization transfer.
+ */
+
+#include "astcenc_internal.h"
+
+#define _ 0 // Using _ to indicate an entry that will not be used.
+
+// NOTE(review): each entry appears to hold, in order: the unquantized value
+// for each quantization step, a scrambled-order index map, the unquantized
+// values in scrambled order, and a packed previous/next-value table (two
+// bytes per used slot, indexed by unquantized value) -- confirm against the
+// quant_and_transfer_table declaration in astcenc_internal.h.
+const quant_and_transfer_table quant_and_xfer_tables[12] {
+	// QUANT2, range 0..1
+	{
+		{0, 64},
+		{0, 1},
+		{0, 64},
+		{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x4000}
+	},
+	// QUANT_3, range 0..2
+	{
+		{0, 32, 64},
+		{0, 1, 2},
+		{0, 32, 64},
+		{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,0x4020}
+	},
+	// QUANT_4, range 0..3
+	{
+		{0, 21, 43, 64},
+		{0, 1, 2, 3},
+		{0, 21, 43, 64},
+		{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,0x402b}
+	},
+	//QUANT_5, range 0..4
+	{
+		{0, 16, 32, 48, 64},
+		{0, 1, 2, 3, 4},
+		{0, 16, 32, 48, 64},
+		{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,0x4030}
+	},
+	// QUANT_6, range 0..5
+	{
+		{0, 12, 25, 39, 52, 64},
+		{0, 2, 4, 5, 3, 1},
+		{0, 64, 12, 52, 25, 39},
+		{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
+	},
+	// QUANT_8, range 0..7
+	{
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7},
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
+		 _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
+		 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
+	},
+	// QUANT_10, range 0..9
+	{
+		{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
+		{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
+		{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
+		{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
+		 0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
+		 _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
+		 _,0x4039}
+	},
+	// QUANT_12, range 0..11
+	{
+		{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
+		{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
+		{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
+		{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
+		 0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
+		 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
+		 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
+	},
+	// QUANT_16, range 0..15
+	{
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
+		 0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
+		 _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
+		 _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
+	},
+	// QUANT_20, range 0..19
+	{
+		{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
+		{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
+		{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
+		{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
+		 0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
+		 0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
+		 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
+		 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
+	},
+	// QUANT_24, range 0..23
+	{
+		{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
+		{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
+		{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
+		{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
+		 _,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
+		 0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
+		 0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
+		 _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
+		 0x403b,_,0x403e}
+	},
+	// QUANT_32, range 0..31
+	{
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
+		 0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
+		 0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
+		 0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
+		 0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
+		 0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
+		 0x403c,_,0x403e}
+	}
+};