Prechádzať zdrojové kódy

Merge pull request #103968 from Chubercik/basis_universal-1.60

basis_universal: Update to 1.60
Thaddeus Crews 3 mesiacov pred
rodič
commit
be994d59c9
43 zmenil súbory, kde vykonal 23489 pridanie a 5251 odobranie
  1. 5 3
      modules/basis_universal/SCsub
  2. 7 7
      modules/basis_universal/image_compress_basisu.cpp
  3. 4 1
      thirdparty/README.md
  4. 8 0
      thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp
  5. 7015 0
      thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
  6. 129 0
      thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h
  7. 5357 0
      thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp
  8. 423 0
      thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h
  9. 0 3310
      thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp
  10. 396 161
      thirdparty/basis_universal/encoder/basisu_comp.cpp
  11. 209 64
      thirdparty/basis_universal/encoder/basisu_comp.h
  12. 328 46
      thirdparty/basis_universal/encoder/basisu_enc.cpp
  13. 432 151
      thirdparty/basis_universal/encoder/basisu_enc.h
  14. 16 94
      thirdparty/basis_universal/encoder/basisu_frontend.cpp
  15. 74 5
      thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
  16. 3146 0
      thirdparty/basis_universal/encoder/basisu_math.h
  17. 15 7
      thirdparty/basis_universal/encoder/basisu_opencl.cpp
  18. 1 1
      thirdparty/basis_universal/encoder/basisu_opencl.h
  19. 11 15
      thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
  20. 1 1
      thirdparty/basis_universal/encoder/basisu_resampler.cpp
  21. 2 0
      thirdparty/basis_universal/encoder/basisu_resampler.h
  22. 12 0
      thirdparty/basis_universal/encoder/basisu_resampler_filters.h
  23. 2 0
      thirdparty/basis_universal/encoder/basisu_ssim.cpp
  24. 1 32
      thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
  25. 1277 0
      thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp
  26. 16 59
      thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.h
  27. 3 3
      thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch
  28. 11 2
      thirdparty/basis_universal/patches/0002-external-jpgd.patch
  29. 3 3
      thirdparty/basis_universal/patches/0003-external-tinyexr.patch
  30. 11 11
      thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch
  31. 13 0
      thirdparty/basis_universal/patches/0005-windows-illegal-character.patch
  32. 22 0
      thirdparty/basis_universal/patches/0006-ambiguous-calls.patch
  33. 163 54
      thirdparty/basis_universal/transcoder/basisu.h
  34. 130 26
      thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h
  35. 208 80
      thirdparty/basis_universal/transcoder/basisu_astc_helpers.h
  36. 2152 693
      thirdparty/basis_universal/transcoder/basisu_containers.h
  37. 778 280
      thirdparty/basis_universal/transcoder/basisu_containers_impl.h
  38. 7 4
      thirdparty/basis_universal/transcoder/basisu_file_headers.h
  39. 769 81
      thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
  40. 253 53
      thirdparty/basis_universal/transcoder/basisu_transcoder.h
  41. 65 3
      thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
  42. 1 1
      thirdparty/libktx/lib/basis_transcode.cpp
  43. 13 0
      thirdparty/libktx/patches/0003-basisu-1.60.patch

+ 5 - 3
modules/basis_universal/SCsub

@@ -20,22 +20,24 @@ basisu_encoder = env.editor_build
 if basisu_encoder:
     encoder_sources = [
         "3rdparty/android_astc_decomp.cpp",
-        "basisu_astc_hdr_enc.cpp",
+        "basisu_astc_hdr_6x6_enc.cpp",
+        "basisu_astc_hdr_common.cpp",
         "basisu_backend.cpp",
         "basisu_basis_file.cpp",
         "basisu_bc7enc.cpp",
-        "basisu_opencl.cpp",
         "basisu_comp.cpp",
         "basisu_enc.cpp",
         "basisu_etc.cpp",
         "basisu_frontend.cpp",
         "basisu_gpu_texture.cpp",
         "basisu_kernels_sse.cpp",
+        "basisu_opencl.cpp",
         "basisu_pvrtc1_4.cpp",
-        "basisu_resampler.cpp",
         "basisu_resample_filters.cpp",
+        "basisu_resampler.cpp",
         "basisu_ssim.cpp",
         "basisu_uastc_enc.cpp",
+        "basisu_uastc_hdr_4x4_enc.cpp",
         "pvpngreader.cpp",
     ]
     encoder_sources = [thirdparty_dir + "encoder/" + file for file in encoder_sources]

+ 7 - 7
modules/basis_universal/image_compress_basisu.cpp

@@ -101,13 +101,13 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 	basisu::basis_compressor_params params;
 
 	params.m_uastc = true;
-	params.m_quality_level = basisu::BASISU_QUALITY_MIN;
-	params.m_pack_uastc_flags &= ~basisu::cPackUASTCLevelMask;
-	params.m_pack_uastc_flags |= basisu::cPackUASTCLevelFastest;
+	params.m_etc1s_quality_level = basisu::BASISU_QUALITY_MIN;
+	params.m_pack_uastc_ldr_4x4_flags &= ~basisu::cPackUASTCLevelMask;
+	params.m_pack_uastc_ldr_4x4_flags |= basisu::cPackUASTCLevelFastest;
 
-	params.m_rdo_uastc = 0.0f;
-	params.m_rdo_uastc_quality_scalar = 0.0f;
-	params.m_rdo_uastc_dict_size = 1024;
+	params.m_rdo_uastc_ldr_4x4 = 0.0f;
+	params.m_rdo_uastc_ldr_4x4_quality_scalar = 0.0f;
+	params.m_rdo_uastc_ldr_4x4_dict_size = 1024;
 
 	params.m_mip_fast = true;
 	params.m_multithreading = true;
@@ -127,7 +127,7 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 	if (is_hdr) {
 		decompress_format = BASIS_DECOMPRESS_HDR_RGB;
 		params.m_hdr = true;
-		params.m_uastc_hdr_options.set_quality_level(0);
+		params.m_uastc_hdr_4x4_options.set_quality_level(0);
 
 	} else {
 		switch (p_channels) {

+ 4 - 1
thirdparty/README.md

@@ -74,7 +74,7 @@ Files extracted from upstream source:
 ## basis_universal
 
 - Upstream: https://github.com/BinomialLLC/basis_universal
-- Version: 1.50.0 (051ad6d8a64bb95a79e8601c317055fd1782ad3e, 2024)
+- Version: 1.60 (323239a6a5ffa57d6570cfc403be99156e33a8b0, 2025)
 - License: Apache 2.0
 
 Files extracted from upstream source:
@@ -89,6 +89,8 @@ Patches:
 - `0002-external-jpgd.patch` (GH-88508)
 - `0003-external-tinyexr.patch` (GH-97582)
 - `0004-remove-tinydds-qoi.patch` (GH-97582)
+- `0005-windows-illegal-character.patch` (GH-103968)
+- `0006-ambiguous-calls.patch` (GH-103968)
 
 
 ## brotli
@@ -523,6 +525,7 @@ Patches:
 
 - `0001-external-basisu.patch` (GH-76572)
 - `0002-disable-astc-block-ext.patch` (GH-76572)
+- `0003-basisu-1.60.patch` (GH-103968)
 
 
 ## libogg

+ 8 - 0
thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp

@@ -836,10 +836,12 @@ void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream&
     m[4]            = data.getNext(numBits);
     deUint32 T7     = data.getNext(1);
 
+#ifndef __EMSCRIPTEN__
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
 #endif  
+#endif
     switch (numValues)
     {
         // \note Fall-throughs.
@@ -851,9 +853,11 @@ void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream&
         default:
             DE_ASSERT(false);
     }
+#ifndef __EMSCRIPTEN__
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif 
+#endif
 
     const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0);
 
@@ -898,10 +902,12 @@ void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream&
     m[2]            = data.getNext(numBits);
     deUint32 Q56    = data.getNext(2);
 
+#ifndef __EMSCRIPTEN__
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
 #endif  
+#endif
     switch (numValues)
     {
         // \note Fall-throughs.
@@ -911,9 +917,11 @@ void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream&
         default:
             DE_ASSERT(false);
     }
+#ifndef __EMSCRIPTEN__
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif 
+#endif
 
     const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0);
 

+ 7015 - 0
thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp

@@ -0,0 +1,7015 @@
+// File: basisu_astc_hdr_6x6_enc.cpp
+#include "basisu_astc_hdr_6x6_enc.h"
+#include "basisu_enc.h"
+#include "basisu_astc_hdr_common.h"
+#include "basisu_math.h"
+#include "basisu_resampler.h"
+#include "basisu_resampler_filters.h"
+
+#define MINIZ_HEADER_FILE_ONLY
+#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+#include "basisu_miniz.h"
+
+#include "3rdparty/android_astc_decomp.h"
+
+#include <array>
+
+using namespace basisu;
+using namespace buminiz;
+using namespace basist::astc_6x6_hdr;
+
+namespace astc_6x6_hdr
+{
+
+// Atomically raises atomic_var to at least new_value (a lock-free "store the maximum").
+// Every access uses relaxed ordering: only the value itself is updated atomically, no ordering
+// with other memory operations is established.
+static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)
+{
+	uint32_t current = atomic_var.load(std::memory_order_relaxed);
+
+	// Only attempt the exchange while the stored value is actually smaller, so no write is issued when
+	// nothing would change. On failure compare_exchange_weak() refreshes 'current' with the latest stored
+	// value (it may also fail spuriously), and the loop simply re-tests against that fresh value.
+	while ((current < new_value) &&
+		!atomic_var.compare_exchange_weak(current, new_value, std::memory_order_relaxed, std::memory_order_relaxed))
+	{
+	}
+}
+
+// Translates a user-facing effort level into the encoder's internal settings.
+// The level is first clamped to [0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL]; levels of 12 and above all
+// select the same (most exhaustive) preset.
+void astc_hdr_6x6_global_config::set_user_level(int level)
+{
+	// One row per user level: { master comp level, highest comp level }.
+	static const struct { uint8_t m_master; uint8_t m_highest; } s_presets[] =
+	{
+		{ 0, 0 },	// 0
+		{ 0, 0 },	// 1
+		{ 0, 1 },	// 2
+		{ 1, 1 },	// 3
+		{ 1, 2 },	// 4
+		{ 1, 3 },	// 5
+		{ 1, 4 },	// 6
+		{ 2, 2 },	// 7
+		{ 2, 3 },	// 8
+		{ 2, 4 },	// 9
+		{ 3, 3 },	// 10
+		{ 3, 4 },	// 11
+		{ 4, 4 }	// 12 and above
+	};
+	const int LAST_PRESET = 12;
+
+	level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);
+
+	const int preset_index = (level < LAST_PRESET) ? level : LAST_PRESET;
+	const bool is_lowest_level = (level == 0);
+	const bool is_last_preset = (preset_index == LAST_PRESET);
+
+	m_master_comp_level = s_presets[preset_index].m_master;
+	m_highest_comp_level = s_presets[preset_index].m_highest;
+
+	// Only the last preset enables the extra patterns and brute force partition matching.
+	m_extra_patterns_flag = is_last_preset;
+	m_brute_force_partition_matching = is_last_preset;
+
+	m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;
+	if (is_lowest_level)
+	{
+		// Both reduce compression a lot when lambda>0
+		m_favor_higher_compression = false;
+		m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;
+	}
+}
+
+const float m1 = 0.1593017578125f;    // 2610 / 2^14 (PQ curve exponent used by forwardPQ())
+const float m2 = 78.84375f;           // 2523 / 32 (PQ curve exponent used by forwardPQ())
+const float c1 = 0.8359375f;          // 3424 / (2^12)
+const float c2 = 18.8515625f;         // (2413 / 128)
+const float c3 = 18.6875f;            // (2392 / 128)
+
+// Forward PQ curve: maps an absolute luminance Y, given in nits, to its PQ-encoded value.
+static float forwardPQ(float Y)
+{
+	// 10,000 nits (cd per square meter) is the absolute reference level, so it normalizes to 1.0.
+	const float normalized = Y * (1.0f / 10000.0f);
+	const float powered = powf(normalized, m1);
+
+	return powf((c1 + c2 * powered) / (1 + c3 * powered), m2);
+}
+
+#if 0
+static float inversePQ(float E) // Compiled out (#if 0). Inverse of forwardPQ(): maps a PQ-encoded value E back to luminance in nits.
+{
+	float N = powf(E, 1.0f / m2);
+
+	float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N); // the numerator is clamped to be non-negative before the division
+	float L = powf(num, 1.0f / m1);
+
+	return L * 10000.0f; // undoes the 1/10000 normalization applied by forwardPQ()
+}
+#endif
+
+// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.
+// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86
+// Highest error is for values less than SMALLEST_PQ_VAL_IN.
+//
+// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:
+// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): 
+// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x
+//
+// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:
+// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless
+
+const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16; // inclusive range of bfloat16 exponents covered by the lookup table
+const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1); // number of table rows, one per exponent
+
+const float SMALLEST_PQ_VAL_IN = 0.000015258829080f; // forwardPQTab() ramps linearly from 0 up to this input when the exponent is below PQ_APPROX_MIN_EXP
+const float SMALLEST_PQ_VAL = 0.000551903737f;		// forwardPQ(SMALLEST_PQ_VAL_IN)
+
+const float LARGEST_PQ_VAL = 1.251312f; // returned by forwardPQTab() when the input's exponent exceeds PQ_APPROX_MAX_EXP
+
+float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128]; // indexed [exponent - PQ_APPROX_MIN_EXP][mantissa 0..127]; filled by init_pq_tables()
+
+// Fills g_pq_approx_tabs with forwardPQ() evaluated at the bfloat16 value built by bfloat16_init(1, exp, mant)
+// for every exponent in [PQ_APPROX_MIN_EXP, PQ_APPROX_MAX_EXP] and every mantissa in [0, 127].
+static void init_pq_tables()
+{
+	for (int row = 0; row < PQ_APPROX_EXP_RANGE; row++)
+	{
+		const int exp = PQ_APPROX_MIN_EXP + row;
+		float* pRow = g_pq_approx_tabs[row];
+
+		for (int mant = 0; mant < 128; mant++)
+		{
+			const float sample = bfloat16_to_float(bfloat16_init(1, exp, mant));
+			pRow[mant] = forwardPQ(sample);
+		}
+	}
+
+	//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));
+	//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));
+}
+
+static inline float forwardPQTab(float v) // Table-based approximation of forwardPQ(v) for v >= 0: interpolates linearly between adjacent bfloat16 table entries.
+{
+	assert(g_pq_approx_tabs[0][0]); // debug check that the table has been filled (see init_pq_tables())
+
+	assert(v >= 0.0f);
+	if (v == 0.0f)
+		return 0.0f;
+
+	bfloat16 bf = float_to_bfloat16(v, false); // presumably converts without rounding up, since the assert below expects bf <= v (TODO confirm)
+	assert(v >= bfloat16_to_float(bf));
+
+	int exp = bfloat16_get_exp(bf);
+
+	if (exp < PQ_APPROX_MIN_EXP)
+	{
+		// not accurate but should be good enough for our uses
+		return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
+	}
+	else if (exp > PQ_APPROX_MAX_EXP)
+		return LARGEST_PQ_VAL; // above the table's exponent range: return the fixed largest value
+
+	int mant = bfloat16_get_mantissa(bf);
+
+	float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant]; // table value for bf (lower interpolation endpoint)
+	float bf_f32 = bfloat16_to_float(bf);
+
+	int next_mant = mant + 1; // step to the next table entry (upper interpolation endpoint)
+	int next_exp = exp;
+	if (next_mant == 128)
+	{
+		next_mant = 0; // mantissa ran off the end of the row: continue at the first entry of the next exponent
+		next_exp++;
+		if (next_exp > PQ_APPROX_MAX_EXP)
+			return a; // no upper neighbor exists in the table, so the lower entry is returned unchanged
+	}
+
+	float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];
+
+	bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
+	float next_bf_f32 = bfloat16_to_float(next_bf);
+	assert(v <= next_bf_f32);
+
+	float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32); // position of v between the two endpoints
+	assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));
+
+	return lerp(a, b, lerp_factor);
+}
+
+// 100 nits = ~.5 i
+// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. 
+// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).
+// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.
+//
+// ITP info:
+// https://www.portrait.com/resource-center/ictcp-color-difference-metric/
+// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)
+// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.
+//
+// Linear REC709 to REC2020/BT.2100 gamut conversion:
+// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;
+// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;
+// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;
+// const float S = 1.0f / 4096.0f;
+// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];
+// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];
+// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];
+// Converts linear RGB to ICtCp (or to ITP when itp_flag is set, which halves the Ct weights).
+// The input is treated as REC 709 unless rec2020_bt2100_color_gamut is true.
+static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)
+{
+	// Work on a private copy of the input.
+	vec3F rgb_2100(rgb_in);
+
+	float l, m, s;
+	if (rec2020_bt2100_color_gamut)
+	{
+		// Assumes REC2020/BT.2100 input color gamut (this is from the spec)
+		l = 0.412109375f    * rgb_2100[0] + 0.52392578125f  * rgb_2100[1] + 0.06396484375f * rgb_2100[2];
+		m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];
+		s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f   * rgb_2100[2];
+	}
+	else
+	{
+		// Assume REC 709 input color gamut
+		// (REC2020_to_LMS * REC709_to_2020) * input_color
+		l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;
+		m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;
+		s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;
+	}
+
+	// PQ-encode each LMS channel with the table-based approximation.
+	const float ld = forwardPQTab(l);
+	const float md = forwardPQTab(m);
+	const float sd = forwardPQTab(s);
+
+	ictcp[0] = .5f * ld + .5f * md;
+
+	if (!itp_flag)
+	{
+		// ICtCp coding space: full-scale Ct.
+		ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;
+	}
+	else
+	{
+		// ITP: Ct scaled by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)
+		ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;
+	}
+
+	ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;
+}
+
+// Convenience wrapper around linear_rgb_to_ictcp() that always requests ITP output and takes the
+// input color gamut from cfg.m_rec2020_bt2100_color_gamut.
+static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
+{
+	const bool want_itp = true;
+	linear_rgb_to_ictcp(rgb_in, itp, want_itp, cfg.m_rec2020_bt2100_color_gamut);
+}
+
+#if 0
+// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
+static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false) // Compiled out (#if 0). Converts ICtCp (or ITP when itp_flag is set) back to linear RGB.
+{
+	float ct = ictcp[1];
+
+	if (itp_flag)
+		ct *= 2.0f; // ITP stores Ct at half scale, so restore the ICtCp scale first
+
+	float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f; // PQ-encoded l/m/s channels
+	float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;
+	float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;
+
+	float l = inversePQ(ld); // undo the PQ encoding per channel
+	float m = inversePQ(md);
+	float s = inversePQ(sd);
+
+	rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
+	rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
+	rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
+}
+#endif
+
+// Three-component vector of basist::half_float values, with helpers that convert components to/from float.
+struct half_vec3
+{
+	basist::half_float m_vals[3];
+
+	// Default construction leaves the components uninitialized.
+	inline half_vec3() { }
+
+	// Constructs from three values that are already half floats.
+	inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)
+	{
+		set(x, y, z);
+	}
+
+	inline half_vec3(const half_vec3& other)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+			m_vals[c] = other.m_vals[c];
+	}
+
+	inline half_vec3& operator= (const half_vec3& rhs)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+			m_vals[c] = rhs.m_vals[c];
+		return *this;
+	}
+
+	// Resets the component array via clear_obj().
+	inline void clear()
+	{
+		clear_obj(m_vals);
+	}
+
+	// Stores three half values as-is.
+	inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)
+	{
+		const basist::half_float src[3] = { x, y, z };
+		for (uint32_t c = 0; c < 3; c++)
+			m_vals[c] = src[c];
+		return *this;
+	}
+
+	// Converts three floats to half and stores them.
+	inline half_vec3& set(float x, float y, float z)
+	{
+		const float src[3] = { x, y, z };
+		for (uint32_t c = 0; c < 3; c++)
+			m_vals[c] = basist::float_to_half(src[c]);
+		return *this;
+	}
+
+	// Converts the first three components of any indexable vector type to half and stores them.
+	template<typename T>
+	inline half_vec3& set_vec(const T& vec)
+	{
+		const basist::half_float hx = basist::float_to_half(vec[0]);
+		const basist::half_float hy = basist::float_to_half(vec[1]);
+		const basist::half_float hz = basist::float_to_half(vec[2]);
+
+		m_vals[0] = hx;
+		m_vals[1] = hy;
+		m_vals[2] = hz;
+		return *this;
+	}
+
+	// Returns the components converted to float, packed into a T constructed from (x, y, z).
+	template<typename T>
+	inline T get_vec() const
+	{
+		const float fx = basist::half_to_float(m_vals[0]);
+		const float fy = basist::half_to_float(m_vals[1]);
+		const float fz = basist::half_to_float(m_vals[2]);
+		return T(fx, fy, fz);
+	}
+
+	// Raw (half) component access.
+	inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }
+	inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }
+
+	// Reads component c as a float.
+	float get_float_comp(uint32_t c) const
+	{
+		assert(c < 3);
+		const basist::half_float h = m_vals[c];
+		return basist::half_to_float(h);
+	}
+
+	// Writes component c from a float.
+	half_vec3& set_float_comp(uint32_t c, float v)
+	{
+		assert(c < 3);
+		const basist::half_float h = basist::float_to_half(v);
+		m_vals[c] = h;
+		return *this;
+	}
+};
+
+// Four-component vector of basist::half_float values, with helpers that convert components to/from float.
+struct half_vec4
+{
+	basist::half_float m_vals[4];
+
+	// Default construction leaves the components uninitialized.
+	inline half_vec4() { }
+
+	// Constructs from four values that are already half floats.
+	inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
+	{
+		set(x, y, z, w);
+	}
+
+	inline half_vec4(const half_vec4& other)
+	{
+		for (uint32_t c = 0; c < 4; c++)
+			m_vals[c] = other.m_vals[c];
+	}
+
+	inline half_vec4& operator= (const half_vec4& rhs)
+	{
+		for (uint32_t c = 0; c < 4; c++)
+			m_vals[c] = rhs.m_vals[c];
+		return *this;
+	}
+
+	// Resets the component array via clear_obj().
+	inline void clear()
+	{
+		clear_obj(m_vals);
+	}
+
+	// Stores four half values as-is.
+	inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
+	{
+		const basist::half_float src[4] = { x, y, z, w };
+		for (uint32_t c = 0; c < 4; c++)
+			m_vals[c] = src[c];
+		return *this;
+	}
+
+	// Converts four floats to half and stores them.
+	inline half_vec4& set(float x, float y, float z, float w)
+	{
+		const float src[4] = { x, y, z, w };
+		for (uint32_t c = 0; c < 4; c++)
+			m_vals[c] = basist::float_to_half(src[c]);
+		return *this;
+	}
+
+	// Converts the first four components of any indexable vector type to half and stores them.
+	template<typename T>
+	inline half_vec4& set_vec(const T& vec)
+	{
+		const basist::half_float hx = basist::float_to_half(vec[0]);
+		const basist::half_float hy = basist::float_to_half(vec[1]);
+		const basist::half_float hz = basist::float_to_half(vec[2]);
+		const basist::half_float hw = basist::float_to_half(vec[3]);
+
+		m_vals[0] = hx;
+		m_vals[1] = hy;
+		m_vals[2] = hz;
+		m_vals[3] = hw;
+		return *this;
+	}
+
+	// Returns the components converted to float, packed into a T constructed from (x, y, z, w).
+	template<typename T>
+	inline T get_vec() const
+	{
+		const float fx = basist::half_to_float(m_vals[0]);
+		const float fy = basist::half_to_float(m_vals[1]);
+		const float fz = basist::half_to_float(m_vals[2]);
+		const float fw = basist::half_to_float(m_vals[3]);
+		return T(fx, fy, fz, fw);
+	}
+
+	// Raw (half) component access.
+	inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }
+	inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }
+
+	// Reads component c as a float.
+	float get_float_comp(uint32_t c) const
+	{
+		assert(c < 4);
+		const basist::half_float h = m_vals[c];
+		return basist::half_to_float(h);
+	}
+
+	// Writes component c from a float.
+	half_vec4& set_float_comp(uint32_t c, float v)
+	{
+		assert(c < 4);
+		const basist::half_float h = basist::float_to_half(v);
+		m_vals[c] = h;
+		return *this;
+	}
+};
+
+const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6; // presumably the largest block footprint handled in this file (6x6) - TODO confirm against users of these constants
+
+struct trial_result // Plain record pairing a logical ASTC block with an error value and a validity flag.
+{
+	astc_helpers::log_astc_block m_log_blk;
+	double m_err; // presumably the error measured for m_log_blk - TODO confirm against the code that fills it
+	bool m_valid; // presumably false when this trial produced no usable block - TODO confirm
+};
+
+//----------------------------------------------------------
+
+const uint32_t NUM_PART3_MAPPINGS = 6; // 3! = 6 ways to relabel three subsets
+static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = // every permutation of the labels {0,1,2}; row i maps an old label to its new label (see partition_pattern_vec::get_permuted3())
+{
+	{ 0, 1, 2 },
+	{ 1, 2, 0 },
+	{ 2, 0, 1 },
+	{ 0, 2, 1 },
+	{ 1, 0, 2 },
+	{ 2, 1, 0 }
+};
+
+// Per-texel subset labels for one 6x6 block, stored row-major (index = x + y * 6).
+// The permute/canonicalize helpers expect labels 0..1 (two subsets) or 0..2 (three subsets).
+struct partition_pattern_vec
+{
+	uint8_t m_parts[6 * 6];
+
+	// All texels start with label 0.
+	partition_pattern_vec()
+	{
+		clear();
+	}
+
+	partition_pattern_vec(const partition_pattern_vec& other)
+	{
+		*this = other;
+	}
+
+	void clear()
+	{
+		memset(m_parts, 0, sizeof(m_parts));
+	}
+
+	partition_pattern_vec& operator= (const partition_pattern_vec& rhs)
+	{
+		if (this == &rhs)
+			return *this;
+		// Copy the whole array; sizeof keeps this in step with the declaration (and with clear()/operator==).
+		memcpy(m_parts, rhs.m_parts, sizeof(m_parts));
+		return *this;
+	}
+
+	// Linear (0..35) texel access.
+	uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }
+	uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }
+
+	// 2D (x, y) texel access.
+	uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
+	uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
+
+	// NOTE: despite the name this sums ABSOLUTE label differences (an L1 sum), not squared differences.
+	// For labels restricted to 0/1 the two are identical; for labels 0..2 they differ.
+	// The computation is left as-is because get_distance() and the vp_tree built on it depend on it.
+	int get_squared_distance(const partition_pattern_vec& other) const
+	{
+		int total_dist = 0;
+		for (uint32_t i = 0; i < 36; i++)
+			total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);
+		return total_dist;
+	}
+
+	// Square root of get_squared_distance().
+	float get_distance(const partition_pattern_vec& other) const
+	{
+		return sqrtf((float)get_squared_distance(other));
+	}
+
+	// Returns the 2-subset pattern with labels optionally swapped (permute_index 0 = unchanged, 1 = 0<->1).
+	partition_pattern_vec get_permuted2(uint32_t permute_index) const
+	{
+		assert(permute_index <= 1);
+
+		partition_pattern_vec res;
+		for (uint32_t i = 0; i < 36; i++)
+		{
+			assert(m_parts[i] <= 1);
+			res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);
+		}
+
+		return res;
+	}
+
+	// Returns the 3-subset pattern relabeled through row permute_index of g_part3_mapping.
+	partition_pattern_vec get_permuted3(uint32_t permute_index) const
+	{
+		assert(permute_index <= 5);
+
+		partition_pattern_vec res;
+		for (uint32_t i = 0; i < 36; i++)
+		{
+			assert(m_parts[i] <= 2);
+			res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];
+		}
+
+		return res;
+	}
+
+	// Returns the pattern relabeled so that labels are numbered 0, 1, 2 in order of first appearance
+	// when scanning texels 0..35.
+	partition_pattern_vec get_canonicalized() const
+	{
+		partition_pattern_vec res;
+
+		int new_labels[3] = { -1, -1, -1 };
+		uint32_t next_index = 0;
+		for (uint32_t i = 0; i < 36; i++)
+		{
+			uint32_t p = m_parts[i];
+			// new_labels has 3 entries, so a label above 2 would index out of bounds (same precondition as get_permuted3()).
+			assert(p < 3);
+			if (new_labels[p] == -1)
+				new_labels[p] = next_index++;
+
+			res.m_parts[i] = (uint8_t)new_labels[p];
+		}
+
+		return res;
+	}
+
+	bool operator== (const partition_pattern_vec& rhs) const
+	{
+		return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;
+	}
+
+	// Conversion to size_t yields a hash of the label array (basisu::hash_hsieh).
+	// Presumably this is how the type is used as a hash container key - TODO confirm.
+	operator size_t() const
+	{
+		return basisu::hash_hsieh(m_parts, sizeof(m_parts));
+	}
+};
+
+struct vp_tree_node // One node of the vantage-point tree built by vp_tree.
+{
+	partition_pattern_vec m_vantage_point; // copy of the pattern chosen as this node's vantage point
+	uint32_t m_point_index; // index of that pattern in the array passed to vp_tree::init()
+	float m_dist; // split distance: patterns with distance <= m_dist from the vantage point go to the inner child, the rest to the outer child
+
+	int m_inner_node, m_outer_node; // child node indices into vp_tree::m_nodes, or -1 when there is no such child
+};
+
+#define BRUTE_FORCE_PART_SEARCH (0)
+
+class vp_tree
+{
+public:
+	vp_tree() // Creates an empty tree; call init() to build it.
+	{
+	}
+
+	void clear() // Discards all nodes; find_nearest() returns no results while the tree is empty.
+	{
+		m_nodes.clear();
+	}
+
+	// Builds the tree over the n patterns in pUnique_pats, replacing any previous contents.
+	// This requires no redundant patterns, i.e. all must be unique.
+	// Returns false if no vantage point could be chosen for the root or for any subtree; the tree may
+	// then be partially built.
+	bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
+	{
+		clear();
+
+		uint_vec pat_indices(n);
+		for (uint32_t i = 0; i < n; i++)
+			pat_indices[i] = i;
+
+		const std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
+
+		// Any negative index means failure (same test create_node() uses).
+		if (root_idx.first < 0)
+			return false;
+
+		m_nodes.resize(1);
+		m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
+		m_nodes[0].m_point_index = root_idx.first;
+		m_nodes[0].m_dist = root_idx.second;
+		m_nodes[0].m_inner_node = -1;
+		m_nodes[0].m_outer_node = -1;
+
+		uint_vec inner_list, outer_list;
+
+		inner_list.reserve(n / 2);
+		outer_list.reserve(n / 2);
+
+		// Split every other pattern by its distance to the root's vantage point.
+		for (uint32_t pat_index = 0; pat_index < n; pat_index++)
+		{
+			if ((int)pat_index == root_idx.first)
+				continue;
+
+			const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);
+
+			if (dist <= root_idx.second)
+				inner_list.push_back(pat_index);
+			else
+				outer_list.push_back(pat_index);
+		}
+
+		// create_node() appends to m_nodes, which can reallocate its storage. Its result is therefore
+		// captured in a local BEFORE m_nodes[0] is indexed: writing "m_nodes[0].m_x = create_node(...)"
+		// is only guaranteed to index after the call under the C++17 evaluation order rules.
+		if (inner_list.size())
+		{
+			const int inner_node = create_node(n, pUnique_pats, inner_list);
+			m_nodes[0].m_inner_node = inner_node;
+			if (inner_node < 0)
+				return false;
+		}
+
+		if (outer_list.size())
+		{
+			const int outer_node = create_node(n, pUnique_pats, outer_list);
+			m_nodes[0].m_outer_node = outer_node;
+			if (outer_node < 0)
+				return false;
+		}
+
+		return true;
+	}
+
+	struct result // One candidate match produced by a nearest-pattern search.
+	{
+		uint32_t m_pat_index; // the visited node's m_point_index
+		uint32_t m_mapping_index; // index of the permuted query pattern that was closest to that node
+		float m_dist; // distance between that permuted query and the node's pattern
+
+		bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; } // results are ordered by distance only
+		bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
+	};
+
+	class result_queue // Fixed-capacity binary max-heap of results keyed on m_dist; elements are stored 1-based (slot 0 is unused).
+	{
+		enum { MaxSupportedSize = 256 + 1 };
+
+	public:
+		result_queue() : 
+			m_cur_size(0) 
+		{
+		}
+
+		size_t get_size() const
+		{
+			return m_cur_size;
+		}
+
+		bool empty() const
+		{
+			return !m_cur_size;
+		}
+
+		typedef std::array<result, MaxSupportedSize + 1> result_array_type; // +1 because slot 0 is never used
+
+		const result_array_type& get_elements() const { return m_elements; } // raw heap storage: valid entries are [1, get_size()], in heap order (not sorted)
+		result_array_type& get_elements() { return m_elements; }
+
+		void clear()
+		{
+			m_cur_size = 0;
+		}
+
+		void reserve(uint32_t n) // no-op: storage is a fixed-size array
+		{
+			BASISU_NOTE_UNUSED(n);
+		}
+
+		const result& top() const // the element with the largest m_dist; the queue must not be empty
+		{
+			assert(m_cur_size);
+			return m_elements[1];
+		}
+
+		bool insert(const result& val, uint32_t max_size) // adds val, then drops the largest-distance element if the queue would hold more than max_size entries
+		{
+			assert(max_size < MaxSupportedSize);
+
+			if (m_cur_size >= MaxSupportedSize)
+				return false; // storage exhausted; val is not added
+
+			m_elements[++m_cur_size] = val;
+			up_heap(m_cur_size);
+
+			if (m_cur_size > max_size)
+				pop(); // keeps only the max_size smallest distances seen so far
+
+			return true;
+		}
+
+		bool pop() // removes the largest-distance element; returns false if the queue is empty
+		{
+			if (m_cur_size == 0) 
+				return false;
+
+			m_elements[1] = m_elements[m_cur_size--]; // move the last element to the root, then restore heap order
+			down_heap(1);
+			return true;
+		}
+								
+		float get_highest_dist() const // largest distance currently held, or 0 when empty
+		{
+			if (!m_cur_size)
+				return 0.0f;
+
+			return top().m_dist;
+		}
+	
+	private:
+		result_array_type m_elements;
+		size_t m_cur_size;
+
+		void up_heap(size_t index) // sifts the element at index up while it is larger than its parent (index >> 1)
+		{
+			while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
+			{
+				std::swap(m_elements[index], m_elements[index >> 1]);
+				index >>= 1;
+			}
+		}
+
+		void down_heap(size_t index) // sifts the element at index down, swapping with its larger child until heap order holds
+		{
+			for ( ; ; )
+			{
+				size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;
+
+				if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
+					largest = left_child;
+
+				if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
+					largest = right_child;
+
+				if (largest == index)
+					break;
+
+				std::swap(m_elements[index], m_elements[largest]);
+				index = largest;
+			}
+		}
+	};
+		
+	// Searches the tree for the patterns nearest to desired_pat, trying every relabeling of it
+	// (2 for two subsets, NUM_PART3_MAPPINGS for three). results is cleared first and left empty if the
+	// tree has no nodes; otherwise it receives at most max_results entries.
+	void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
+	{
+		assert((num_subsets >= 2) && (num_subsets <= 3));
+
+		results.clear();
+
+		if (m_nodes.size() == 0)
+			return;
+
+		const bool two_subsets = (num_subsets == 2);
+		const uint32_t num_desired_pats = two_subsets ? 2u : NUM_PART3_MAPPINGS;
+
+		// Build every label permutation of the query pattern up front.
+		partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];
+		for (uint32_t perm = 0; perm < num_desired_pats; perm++)
+		{
+			if (two_subsets)
+				desired_pats[perm] = desired_pat.get_permuted2(perm);
+			else
+				desired_pats[perm] = desired_pat.get_permuted3(perm);
+		}
+
+		// The recursive variant is kept for reference; the explicit-stack variant is the one compiled in.
+#if 0
+		find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
+#else
+		find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
+#endif
+	}
+
+private:
+	basisu::vector<vp_tree_node> m_nodes;
+
+	void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) // Recursive search of the subtree at node_index; its only call site in find_nearest() is compiled out (#if 0).
+	{
+		float best_dist_to_vantage = BIG_FLOAT_VAL;
+		uint32_t best_mapping = 0;
+		for (uint32_t i = 0; i < num_desired_pats; i++) // pick the query permutation closest to this node's vantage point
+		{
+			float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
+			if (dist < best_dist_to_vantage)
+			{
+				best_dist_to_vantage = dist;
+				best_mapping = i;
+			}
+		}
+
+		result r;
+		r.m_dist = best_dist_to_vantage;
+		r.m_mapping_index = best_mapping;
+		r.m_pat_index = m_nodes[node_index].m_point_index;
+
+		results.insert(r, max_results); // every visited node is offered as a candidate
+
+		if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
+		{
+			// inner first
+			if (m_nodes[node_index].m_inner_node >= 0)
+				find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
+
+			if (m_nodes[node_index].m_outer_node >= 0)
+			{
+				if ( (results.get_size() < max_results) || 
+					((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) // gap to the split boundary is within the worst kept distance
+					)
+				{
+					find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
+				}
+			}
+		}
+		else
+		{
+			// outer first
+			if (m_nodes[node_index].m_outer_node >= 0)
+				find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
+
+			if (m_nodes[node_index].m_inner_node >= 0)
+			{
+				if ( (results.get_size() < max_results) || 
+					((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) // gap to the split boundary is within the worst kept distance
+					)
+				{
+					find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
+				}
+			}
+		}
+	}
+		
+	void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) // Explicit-stack search of the subtree at init_node_index; this is the variant find_nearest() calls.
+	{
+		uint_vec node_stack; // indices of nodes still to visit (LIFO)
+		node_stack.reserve(16);
+		node_stack.push_back(init_node_index);
+		
+		do
+		{
+			const uint32_t node_index = node_stack.back();
+			node_stack.pop_back();
+
+			float best_dist_to_vantage = BIG_FLOAT_VAL;
+			uint32_t best_mapping = 0;
+			for (uint32_t i = 0; i < num_desired_pats; i++) // pick the query permutation closest to this node's vantage point
+			{
+				float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
+				if (dist < best_dist_to_vantage)
+				{
+					best_dist_to_vantage = dist;
+					best_mapping = i;
+				}
+			}
+
+			result r;
+			r.m_dist = best_dist_to_vantage;
+			r.m_mapping_index = best_mapping;
+			r.m_pat_index = m_nodes[node_index].m_point_index;
+
+			results.insert(r, max_results); // every visited node is offered as a candidate
+
+			if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
+			{
+				if (m_nodes[node_index].m_outer_node >= 0)
+				{
+					if ((results.get_size() < max_results) ||
+						((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) // evaluated at push time, i.e. against the results gathered before the inner child is searched
+						)
+					{
+						node_stack.push_back(m_nodes[node_index].m_outer_node);
+					}
+				}
+
+				// inner first (pushed last, so it is popped and visited before the outer child)
+				if (m_nodes[node_index].m_inner_node >= 0)
+				{
+					node_stack.push_back(m_nodes[node_index].m_inner_node);
+				}
+			}
+			else
+			{
+				if (m_nodes[node_index].m_inner_node >= 0)
+				{
+					if ((results.get_size() < max_results) ||
+						((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) // evaluated at push time, i.e. against the results gathered before the outer child is searched
+						)
+					{
+						node_stack.push_back(m_nodes[node_index].m_inner_node);
+					}
+				}
+
+				// outer first (pushed last, so it is popped and visited before the inner child)
+				if (m_nodes[node_index].m_outer_node >= 0)
+				{
+					node_stack.push_back(m_nodes[node_index].m_outer_node);
+				}
+			}
+
+		} while (!node_stack.empty());
+	}
+
+	// returns the index of the new node, or -1 on error
+	// Recursively builds the subtree for the patterns listed in pat_indices.
+	// A vantage point and split distance are chosen by find_best_vantage_point(); the remaining patterns are
+	// divided into an inner list (distance <= split distance) and an outer list, each of which becomes a child subtree.
+	// n and pUnique_pats describe the full pattern table and are passed through unchanged.
+	// returns the index of the new node, or -1 on error
+	int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
+	{
+		const std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
+
+		if (root_idx.first < 0)
+			return -1;
+
+		m_nodes.resize(m_nodes.size() + 1);
+		const uint32_t new_node_index = m_nodes.size_u32() - 1;
+
+		m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
+		m_nodes[new_node_index].m_point_index = root_idx.first;
+		m_nodes[new_node_index].m_dist = root_idx.second;
+		m_nodes[new_node_index].m_inner_node = -1;
+		m_nodes[new_node_index].m_outer_node = -1;
+
+		uint_vec inner_list, outer_list;
+
+		inner_list.reserve(pat_indices.size_u32() / 2);
+		outer_list.reserve(pat_indices.size_u32() / 2);
+
+		for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
+		{
+			const uint32_t pat_index = pat_indices[pat_indices_iter];
+
+			// The vantage point itself lives in this node, not in a child.
+			if ((int)pat_index == root_idx.first)
+				continue;
+
+			const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);
+
+			if (dist <= root_idx.second)
+				inner_list.push_back(pat_index);
+			else
+				outer_list.push_back(pat_index);
+		}
+
+		// The recursive calls append to m_nodes and may reallocate it. Finish each call before indexing
+		// m_nodes for the store, so the code does not depend on the operand evaluation order of '='.
+		if (inner_list.size())
+		{
+			const int inner_node = create_node(n, pUnique_pats, inner_list);
+			m_nodes[new_node_index].m_inner_node = inner_node;
+		}
+
+		if (outer_list.size())
+		{
+			const int outer_node = create_node(n, pUnique_pats, outer_list);
+			m_nodes[new_node_index].m_outer_node = outer_node;
+		}
+
+		return new_node_index;
+	}
+
+	// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
+	// Tries every pattern in pat_indices as a vantage point and picks the one whose median distance to the
+	// other patterns splits them most evenly into inner (<= median) and outer (> median) groups.
+	// Ties on the balance metric are broken in favor of the larger distance variance.
+	// num_unique_pats is only used by asserts.
+	// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
+	std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
+	{
+		BASISU_NOTE_UNUSED(num_unique_pats);
+
+		const uint32_t n = pat_indices.size_u32();
+
+		assert(n);
+		if (n == 1)
+			return std::pair(pat_indices[0], 0.0f);
+
+		float best_split_metric = -1.0f;
+		int best_split_pat = -1;
+		float best_split_dist = 0.0f;
+		float best_split_var = 0.0f;
+
+		// Distances from the current trial vantage point to every other pattern; reused across trials.
+		float_vec float_dists;
+		float_dists.reserve(n);
+
+		for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
+		{
+			const uint32_t split_pat_index = pat_indices[pat_indices_iter];
+			assert(split_pat_index < num_unique_pats);
+
+			const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];
+
+			float_dists.resize(0);
+
+			for (uint32_t j = 0; j < n; j++)
+			{
+				const uint32_t pat_index = pat_indices[j];
+				assert(pat_index < num_unique_pats);
+
+				if (pat_index == split_pat_index)
+					continue;
+
+				float_dists.push_back(trial_vantage.get_distance(pUnique_pats[pat_index]));
+			}
+
+			// Statistics are gathered before sorting so the values are fed in their original order.
+			stats<double> s;
+			s.calc(float_dists.size_u32(), float_dists.data());
+
+			// Only the distance values matter from here on, so sort them in place to find the median.
+			std::sort(float_dists.begin(), float_dists.end());
+
+			const uint32_t num_dists = float_dists.size_u32();
+			float split_dist = float_dists[num_dists / 2];
+			if ((num_dists & 1) == 0)
+				split_dist = (split_dist + float_dists[(num_dists / 2) - 1]) * .5f;
+
+			// Count both sides of the split from the cached distances instead of recomputing each one.
+			uint32_t total_inner = 0, total_outer = 0;
+
+			for (uint32_t j = 0; j < num_dists; j++)
+			{
+				if (float_dists[j] <= split_dist)
+					total_inner++;
+				else
+					total_outer++;
+			}
+
+			// 1.0 means a perfectly balanced split, 0.0 means everything landed on one side.
+			float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);
+
+			if ( (split_metric > best_split_metric) ||
+				 ((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
+			{
+				best_split_metric = split_metric;
+				best_split_dist = split_dist;
+				best_split_pat = split_pat_index;
+				best_split_var = (float)s.m_var;
+			}
+		}
+
+		return std::pair(best_split_pat, best_split_dist);
+	}
+};
+
+// Thin wrapper around a packed 64-bit pattern value that must fit in 36 bits (checked by asserts).
+// Provides ordering, equality, and a size_t conversion that yields a hash of the value.
+struct partition
+{
+	uint64_t m_p;
+
+	inline partition() : m_p(0) { }
+
+	inline partition(uint64_t p) : m_p(p)
+	{
+		assert(p < (1ULL << 36));
+	}
+
+	inline partition& operator=(uint64_t p)
+	{
+		assert(p < (1ULL << 36));
+		m_p = p;
+		return *this;
+	}
+
+	inline bool operator< (const partition& rhs) const { return m_p < rhs.m_p; }
+
+	inline bool operator== (const partition& rhs) const { return m_p == rhs.m_p; }
+
+	// Hash of the raw bytes of m_p.
+	inline operator size_t() const
+	{
+		const uint8_t* pBytes = (const uint8_t *)&m_p;
+		return hash_hsieh(pBytes, sizeof(m_p));
+	}
+};
+
+// 2-subset partition patterns for 6x6 blocks, indexed by unique pattern index (filled by init_partitions2_6x6()).
+partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
+// Maps a partition seed index (0-1023) to its unique pattern index; only seeds listed in g_part2_unique_index_to_seed are written.
+int g_part2_seed_to_unique_index[1024];
+// Vantage point tree built over g_partitions2, used to find the patterns nearest to a desired pattern.
+vp_tree g_part2_vp_tree;
+
+// Scales 'axis' by bu_math::inv_sqrt() of its norm() value.
+// If that norm value is below SMALL_FLOAT_VAL in magnitude, a fixed vector with all components set to 0.577350269 is returned instead.
+static inline vec3F vec3F_norm_approx(vec3F axis)
+{
+	const float n = axis.norm();
+
+	if (fabs(n) >= SMALL_FLOAT_VAL)
+		axis = axis * bu_math::inv_sqrt(n);
+	else
+		axis = vec3F(0.577350269f);
+
+	return axis;
+}
+
+// Fills the 2-subset pattern tables for 6x6 blocks and builds the VP tree over them.
+// The active (#else) path walks the precomputed g_part2_unique_index_to_seed table, records the reverse
+// seed -> unique index mapping, and regenerates each 36-texel pattern with astc_helpers::compute_texel_partition().
+// The disabled (#if 0) path is reference code that derives the unique patterns from all 1024 seeds.
+static void init_partitions2_6x6()
+{
+#if 0
+	// maps pattern bits to the 10-bit ASTC seed index
+	typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
+	partition2_hash_map phash;
+	phash.reserve(1024);
+
+	for (uint32_t i = 0; i < 1024; i++)
+	{
+		uint64_t p_bits = 0;
+		uint64_t p_bits_inv = 0;
+				
+		for (uint32_t y = 0; y < 6; y++)
+		{
+			for (uint32_t x = 0; x < 6; x++)
+			{
+				uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
+				assert(p < 2);
+								
+				p_bits |= (p << (x + y * 6));
+				p_bits_inv |= ((1 - p) << (x + y * 6));
+			}
+		}
+				
+		// skip patterns that put every texel into a single subset
+		if (!p_bits)
+			continue;
+		if (p_bits == ((1ULL << 36) - 1))
+			continue;
+
+		assert(p_bits < (1ULL << 36));
+		assert(p_bits_inv < (1ULL << 36));
+
+		// a pattern and its inverse (subsets swapped) count as the same pattern
+		if (phash.contains(p_bits))
+		{
+		}
+		else if (phash.contains(p_bits_inv))
+		{
+		}
+		else
+		{
+			auto res = phash.insert(p_bits, i);
+			assert(res.second);
+			BASISU_NOTE_UNUSED(res);
+		}
+	}
+		
+	uint32_t num_unique_partitions2 = 0;
+		
+	for (const auto& r : phash)
+	{
+		assert(r.second < 1024);
+		
+		const uint32_t unique_index = num_unique_partitions2;
+		assert(unique_index < NUM_UNIQUE_PARTITIONS2);
+
+		// unpack the 36 pattern bits into one byte per texel
+		partition_pattern_vec pat_vec;
+		for (uint32_t i = 0; i < 36; i++)
+			pat_vec[i] = (uint8_t)((r.first >> i) & 1);
+
+		g_partitions2[unique_index] = pat_vec;
+		
+		assert(g_part2_unique_index_to_seed[unique_index] == r.second);
+		g_part2_seed_to_unique_index[r.second] = unique_index;
+
+		num_unique_partitions2++;
+	}
+	assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
+#else
+	for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
+	{
+		const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
+		assert(seed_index < 1024);
+
+		// each seed is expected to be visited once; the reverse table starts out zeroed
+		assert(g_part2_seed_to_unique_index[seed_index] == 0);
+		g_part2_seed_to_unique_index[seed_index] = unique_index;
+
+		partition_pattern_vec& pat_vec = g_partitions2[unique_index];
+
+		// texels are stored row-major: index = x + y * 6
+		for (uint32_t y = 0; y < 6; y++)
+		{
+			for (uint32_t x = 0; x < 6; x++)
+			{
+				uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
+				assert(p < 2);
+
+				pat_vec[x + y * 6] = p;
+			}
+		}
+	}
+#endif
+
+	g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
+}
+
+// Estimates which 2-subset partition patterns best fit a 6x6 block.
+// An approximate principal axis of the block's colors is computed, each texel is assigned to a subset by the
+// sign of its projection onto that axis (relative to the mean), and the num_best_parts patterns closest to that
+// desired assignment are written to pBest_parts as unique pattern indices.
+// pBlock_pixels_half holds the block's 36 texels (3 components each); pBest_parts must have room for
+// num_best_parts entries, and num_best_parts must not exceed NUM_UNIQUE_PARTITIONS2.
+// Always returns true.
+static bool estimate_partition2_6x6(
+	const basist::half_float pBlock_pixels_half[][3],
+	int* pBest_parts, uint32_t num_best_parts)
+{
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
+
+	// Same bound as estimate_partition3_6x6(): the brute force path indexes part_similarity[] from the end,
+	// which would wrap around if more results than patterns were requested.
+	assert(num_best_parts <= NUM_UNIQUE_PARTITIONS2);
+		
+	vec3F training_vecs[BLOCK_T], mean(0.0f);
+
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F& v = training_vecs[i];
+
+		v[0] = (float)pBlock_pixels_half[i][0];
+		v[1] = (float)pBlock_pixels_half[i][1];
+		v[2] = (float)pBlock_pixels_half[i][2];
+
+		mean += v;
+	}
+	mean *= (1.0f / (float)BLOCK_T);
+
+	vec3F max_vals(-BIG_FLOAT_VAL);
+
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F& v = training_vecs[i];
+		max_vals = vec3F::component_max(max_vals, v);
+	}
+
+	// Initialize principal axis approximation
+	vec3F axis(max_vals - mean);
+
+	// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		axis = vec3F_norm_approx(axis);
+
+		vec3F color(training_vecs[i] - mean);
+
+		float d = color.dot(axis);
+
+		axis += color * d;
+	}
+
+	// Fall back to a fixed diagonal axis if the estimate collapsed.
+	if (axis.norm() < SMALL_FLOAT_VAL)
+		axis.set(0.57735027f);
+	else
+		axis.normalize_in_place();
+
+	// Desired subset per texel: 1 if the texel projects onto the negative side of the axis, else 0.
+#if BRUTE_FORCE_PART_SEARCH
+	int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		float proj = (training_vecs[i] - mean).dot(axis);
+
+		desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
+	}
+#else
+	partition_pattern_vec desired_part;
+
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		float proj = (training_vecs[i] - mean).dot(axis);
+
+		desired_part.m_parts[i] = proj < 0.0f;
+	}
+#endif
+	
+	//interval_timer tm;
+	//tm.start();
+	
+#if BRUTE_FORCE_PART_SEARCH
+	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];
+
+	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
+	{
+		const partition_pattern_vec &pat_vec = g_partitions2[part_index];
+
+		int total_sim_non_inv = 0;
+		int total_sim_inv = 0;
+
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				int part = pat_vec[x + y * 6];
+
+				if (part == desired_parts[y][x])
+					total_sim_non_inv++;
+
+				if ((part ^ 1) == desired_parts[y][x])
+					total_sim_inv++;
+			}
+		}
+
+		// Subset labels are interchangeable, so score the pattern both as-is and inverted.
+		int total_sim = maximum(total_sim_non_inv, total_sim_inv);
+
+		// Similarity in the high bits, pattern index in the low 16 bits, so sorting orders by similarity.
+		part_similarity[part_index] = (total_sim << 16) | part_index;
+
+	} // part_index;
+
+	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);
+
+	// Most similar patterns are at the end of the sorted array.
+	for (uint32_t i = 0; i < num_best_parts; i++)
+		pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
+#else
+	vp_tree::result_queue results;
+	results.reserve(num_best_parts);
+	g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);
+
+	assert(results.get_size() == num_best_parts);
+
+	const auto& elements = results.get_elements();
+
+	// NOTE(review): entries are read starting at index 1 - presumably the queue's element array is 1-based; confirm against result_queue.
+	for (uint32_t i = 0; i < results.get_size(); i++)
+		pBest_parts[i] = elements[1 + i].m_pat_index;
+#endif
+
+	//fmt_printf("{} ", tm.get_elapsed_ms());
+
+	return true;
+}
+
+// Lowest comp_level at which the subset encoders in this file attempt endpoint refinement on downsampled weight grids.
+const uint32_t MIN_REFINE_LEVEL = 0;
+
+// Encodes a 6x6 block with two subsets using the 2-subset pattern selected by unique_pat_index.
+// Each subset's texels are gathered and encoded independently with CEM 7 or CEM 11, then the per-subset
+// weights are scattered back into block order.
+//   res[0] - receives the encoded block (weights are downsampled when the grid is smaller than 6x6).
+//   res[1] - additionally receives a variant with refined endpoints, only when the grid is smaller than the
+//            block, refine_endpoints_flag is set, comp_level >= MIN_REFINE_LEVEL and at least one subset was refined.
+// Returns false if either subset fails to encode or no downsample matrix exists for the grid size.
+static bool encode_block_2_subsets(
+	trial_result res[2],
+	uint32_t grid_w, uint32_t grid_h,
+	uint32_t cem,
+	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
+	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
+	astc_hdr_codec_base_options& coptions,
+	bool uber_mode_flag,
+	int unique_pat_index,
+	uint32_t comp_level,
+	opt_mode_t mode11_opt_mode,
+	bool refine_endpoints_flag)
+{
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
+	const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
+
+	res[0].m_valid = false;
+	res[1].m_valid = false;
+
+	astc_helpers::log_astc_block best_log_blk;
+	clear_obj(best_log_blk);
+
+	best_log_blk.m_num_partitions = 2;
+	for (uint32_t s = 0; s < 2; s++)
+		best_log_blk.m_color_endpoint_modes[s] = (uint8_t)cem;
+	best_log_blk.m_grid_width = (uint8_t)grid_w;
+	best_log_blk.m_grid_height = (uint8_t)grid_h;
+	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
+	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
+
+	partition_pattern_vec& pat = g_partitions2[unique_pat_index];
+	const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];
+
+	// Gather each subset's texels (in block raster order) together with their block offsets.
+	vec4F part_pixels_q16[2][64];
+	half_vec3 part_half_pixels[2][64];
+	uint8_t part_pixel_index[2][64];
+	uint32_t part_total_pixels[2] = { 0 };
+
+	for (uint32_t texel = 0; texel < BLOCK_T; texel++)
+	{
+		const uint32_t subset = pat[texel];
+		const uint32_t slot = part_total_pixels[subset]++;
+
+		part_pixels_q16[subset][slot] = pBlock_pixels_q16[texel];
+		part_half_pixels[subset][slot] = pBlock_pixels_half[texel];
+		part_pixel_index[subset][slot] = (uint8_t)texel;
+	}
+
+	uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
+	uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
+	uint32_t best_submode[2];
+
+	// Encode the two subsets independently.
+	for (uint32_t subset = 0; subset < 2; subset++)
+	{
+		assert(part_total_pixels[subset]);
+
+		double subset_err;
+		if (cem == 7)
+		{
+			subset_err = encode_astc_hdr_block_mode_7(
+				part_total_pixels[subset],
+				(basist::half_float(*)[3])part_half_pixels[subset], (vec4F*)part_pixels_q16[subset],
+				best_log_blk.m_weight_ise_range,
+				best_submode[subset],
+				BIG_FLOAT_VAL,
+				blk_endpoints[subset],
+				blk_weights[subset],
+				coptions,
+				best_log_blk.m_endpoint_ise_range);
+		}
+		else
+		{
+			assert(cem == 11);
+
+			subset_err = encode_astc_hdr_block_mode_11(
+				part_total_pixels[subset],
+				(basist::half_float(*)[3])part_half_pixels[subset], (vec4F*)part_pixels_q16[subset],
+				best_log_blk.m_weight_ise_range,
+				best_submode[subset],
+				BIG_FLOAT_VAL,
+				blk_endpoints[subset],
+				blk_weights[subset],
+				coptions,
+				false,
+				best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
+				mode11_opt_mode);
+		}
+
+		if (subset_err == BIG_FLOAT_VAL)
+			return false;
+	}
+
+	// Scatter the per-subset weights back into block raster order.
+	uint8_t ise_weights[BLOCK_W * BLOCK_H];
+	uint32_t next_subset_weight[2] = { 0, 0 };
+
+	for (uint32_t texel = 0; texel < BLOCK_T; texel++)
+	{
+		const uint32_t subset = pat[texel];
+		ise_weights[texel] = blk_weights[subset][next_subset_weight[subset]++];
+	}
+
+	// Partition seed and endpoints are identical for both grid cases.
+	best_log_blk.m_partition_id = (uint16_t)p_seed;
+	memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
+	memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
+
+	// Full resolution grid: the ISE weights can be used directly.
+	if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
+	{
+		memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
+
+		res[0].m_valid = true;
+		res[0].m_log_blk = best_log_blk;
+
+		return true;
+	}
+
+	// Smaller grid: dequantize, downsample to the grid, then requantize to ISE symbols.
+	const auto& ise_to_val = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
+
+	uint8_t desired_weights[BLOCK_H * BLOCK_W];
+	for (uint32_t texel = 0; texel < BLOCK_T; texel++)
+		desired_weights[texel] = ise_to_val[ise_weights[texel]];
+
+	uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
+
+	const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
+	if (!pDownsample_matrix)
+	{
+		assert(0);
+		return false;
+	}
+
+	downsample_weight_grid(
+		pDownsample_matrix,
+		BLOCK_W, BLOCK_H,		// source/from dimension (block size)
+		grid_w, grid_h,			// dest/to dimension (grid size)
+		desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
+		downsampled_weights);	// [wy][wx]
+
+	const auto& val_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
+
+	const uint32_t total_grid_weights = grid_w * grid_h;
+	for (uint32_t g = 0; g < total_grid_weights; g++)
+		best_log_blk.m_weights[g] = val_to_ise[downsampled_weights[g]];
+
+	res[0].m_valid = true;
+	res[0].m_log_blk = best_log_blk;
+
+	// Optionally re-fit the endpoints to the downsampled weights and report that as a second candidate.
+	if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
+	{
+		bool any_refined = false;
+
+		for (uint32_t subset = 0; subset < 2; subset++)
+		{
+			const bool refined = refine_endpoints(
+				cem,
+				endpoints_ise_range,
+				best_log_blk.m_endpoints + subset * num_endpoint_vals, // the endpoints to optimize
+				BLOCK_W, BLOCK_H, // block dimensions
+				grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
+				part_total_pixels[subset], (basist::half_float(*)[3])part_half_pixels[subset], (vec4F*)part_pixels_q16[subset],
+				&part_pixel_index[subset][0], // maps this subset's pixels to block offsets
+				coptions, mode11_opt_mode);
+
+			any_refined = any_refined || refined;
+		}
+
+		if (any_refined)
+		{
+			res[1].m_valid = true;
+			res[1].m_log_blk = best_log_blk;
+		}
+	}
+
+	return true;
+}
+
+// Maps a 3-subset pattern to a (seed index, unique index) pair; used to detect duplicate patterns while building the tables below.
+typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;
+
+// 3-subset partition patterns for 6x6 blocks, indexed by unique pattern index (filled by init_partitions3_6x6()).
+partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
+// Maps a partition seed index (0-1023) to its unique 3-subset pattern index, or -1 if the seed has none.
+int g_part3_seed_to_unique_index[1024];
+// Vantage point tree built over g_partitions3, used to find the patterns nearest to a desired pattern.
+vp_tree g_part3_vp_tree;
+
+// Builds the 3-subset pattern tables for 6x6 blocks and the VP tree over them.
+// All 1024 partition seeds are expanded to 36-texel patterns. Seeds that leave any subset empty, or whose
+// pattern equals an already accepted pattern under one of the NUM_PART3_MAPPINGS subset relabelings, are skipped.
+// The accepted patterns are expected to match the precomputed g_part3_unique_index_to_seed table exactly.
+static void init_partitions3_6x6()
+{
+	uint32_t t = 0;
+
+	for (uint32_t i = 0; i < 1024; i++)
+		g_part3_seed_to_unique_index[i] = -1;
+
+	partition3_hash_map part3_hash;
+	part3_hash.reserve(512);
+		
+	for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
+	{
+		partition_pattern_vec p3;
+		uint32_t part_hist[3] = { 0 };
+
+		for (uint32_t y = 0; y < 6; y++)
+		{
+			for (uint32_t x = 0; x < 6; x++)
+			{
+				uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
+				assert(p < 3);
+
+				p3.m_parts[x + y * 6] = (uint8_t)p;
+				part_hist[p]++;
+			}
+		}
+
+		// All three subsets must be used.
+		if (!part_hist[0] || !part_hist[1] || !part_hist[2])
+			continue;
+
+		// Reject the pattern if any relabeling of it has been seen before.
+		uint32_t j;
+		for (j = 0; j < NUM_PART3_MAPPINGS; j++)
+		{
+			partition_pattern_vec temp_part3(p3.get_permuted3(j));
+
+			if (part3_hash.contains(temp_part3))
+				break;
+		}
+		if (j < NUM_PART3_MAPPINGS)
+			continue;
+
+		// Check the slot index before it is used on the fixed-size tables.
+		assert(t < NUM_UNIQUE_PARTITIONS3);
+
+		part3_hash.insert(p3, std::make_pair(seed_index, t) );
+
+		assert(g_part3_unique_index_to_seed[t] == seed_index);
+		g_part3_seed_to_unique_index[seed_index] = t;
+		g_partitions3[t] = p3;
+
+		t++;
+	}
+
+	// The VP tree below is built over all NUM_UNIQUE_PARTITIONS3 entries, so every slot must have been filled.
+	assert(t == NUM_UNIQUE_PARTITIONS3);
+
+	g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
+}
+
+// Estimates which 3-subset partition patterns best fit a 6x6 block.
+// Three seed colors are chosen (darkest texel, brightest texel, and the texel furthest from both), refined
+// with a few k-means iterations, and the resulting per-texel cluster assignment is used as the desired pattern.
+// The num_best_parts closest patterns are written to pBest_parts as unique pattern indices.
+// Returns false if three distinct, non-empty clusters cannot be formed (e.g. the block has fewer than three
+// distinct colors); pBest_parts is not written in that case.
+static bool estimate_partition3_6x6(
+	const basist::half_float pBlock_pixels_half[][3],
+	int* pBest_parts, uint32_t num_best_parts)
+{
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;
+
+	assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));
+
+	vec3F training_vecs[BLOCK_T], mean(0.0f);
+
+	// brightest_inten starts below any possible intensity so that the first texel always seeds both
+	// cluster_centroids[0] and cluster_centroids[1]. Starting it at 0 left centroid 1 unwritten when no
+	// texel had an intensity above 0.
+	float brightest_inten = -BIG_FLOAT_VAL, darkest_inten = BIG_FLOAT_VAL;
+	vec3F cluster_centroids[NUM_SUBSETS];
+
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F& v = training_vecs[i];
+
+		v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);
+
+		// Intensity is the sum of the three components.
+		float inten = v.dot(vec3F(1.0f));
+		if (inten < darkest_inten)
+		{
+			darkest_inten = inten;
+			cluster_centroids[0] = v;
+		}
+
+		if (inten > brightest_inten)
+		{
+			brightest_inten = inten;
+			cluster_centroids[1] = v;
+		}
+	}
+
+	if (cluster_centroids[0] == cluster_centroids[1])
+		return false;
+
+	// Third seed: the texel with the largest summed squared distance to the first two seeds,
+	// ignoring texels that coincide with either of them.
+	float furthest_dist2 = 0.0f;
+	bool found_third_seed = false;
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F& v = training_vecs[i];
+
+		float dist_a = v.squared_distance(cluster_centroids[0]);
+		if (dist_a == 0.0f)
+			continue;
+
+		float dist_b = v.squared_distance(cluster_centroids[1]);
+		if (dist_b == 0.0f)
+			continue;
+
+		float dist2 = dist_a + dist_b;
+		if (dist2 > furthest_dist2)
+		{
+			furthest_dist2 = dist2;
+			cluster_centroids[2] = v;
+			found_third_seed = true;
+		}
+	}
+
+	// Without a third distinct color, cluster_centroids[2] was never written, so there is nothing to cluster on.
+	if (!found_third_seed)
+		return false;
+
+	if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
+		return false;
+		
+	uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
+	uint32_t num_cluster_pixels[NUM_SUBSETS];
+	vec3F new_cluster_means[NUM_SUBSETS];
+
+	const uint32_t NUM_ITERS = 4;
+	
+	// k-means: assign each texel to its nearest centroid, then move each centroid to the mean of its texels.
+	for (uint32_t s = 0; s < NUM_ITERS; s++)
+	{
+		memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
+		memset(new_cluster_means, 0, sizeof(new_cluster_means));
+
+		for (uint32_t i = 0; i < BLOCK_T; i++)
+		{
+			float d[NUM_SUBSETS] = { 
+				training_vecs[i].squared_distance(cluster_centroids[0]), 
+				training_vecs[i].squared_distance(cluster_centroids[1]), 
+				training_vecs[i].squared_distance(cluster_centroids[2]) };
+
+			float min_d = d[0];
+			uint32_t min_idx = 0;
+			for (uint32_t j = 1; j < NUM_SUBSETS; j++)
+			{
+				if (d[j] < min_d)
+				{
+					min_d = d[j];
+					min_idx = j;
+				}
+			}
+
+			cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
+			new_cluster_means[min_idx] += training_vecs[i];
+			num_cluster_pixels[min_idx]++;
+		} // i
+
+		for (uint32_t j = 0; j < NUM_SUBSETS; j++)
+		{
+			// An empty cluster means no usable 3-subset split was found.
+			if (!num_cluster_pixels[j])
+				return false;
+
+			cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
+		}
+	} // s
+		
+	// Convert the final cluster membership into a per-texel subset pattern.
+	partition_pattern_vec desired_part;
+	for (uint32_t p = 0; p < NUM_SUBSETS; p++)
+	{
+		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
+		{
+			const uint32_t pix_index = cluster_pixels[p][i];
+			desired_part[pix_index] = (uint8_t)p;
+		}
+	}
+
+#if BRUTE_FORCE_PART_SEARCH
+	// Subset labels are arbitrary, so compare against every relabeling of the desired pattern.
+	partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
+	for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
+		desired_parts[j] = desired_part.get_permuted3(j);
+
+	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];
+
+	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
+	{
+		const partition_pattern_vec& pat = g_partitions3[part_index];
+
+		uint32_t lowest_pat_dist = UINT32_MAX;
+		for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
+		{
+			uint32_t dist = pat.get_squared_distance(desired_parts[p]);
+			if (dist < lowest_pat_dist)
+				lowest_pat_dist = dist;
+		}
+
+		// Distance in the high bits, pattern index in the low 16 bits, so sorting orders by distance.
+		part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;
+
+	} // part_index;
+
+	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);
+		
+	for (uint32_t i = 0; i < num_best_parts; i++)
+		pBest_parts[i] = part_similarity[i] & 0xFFFF;
+#else
+	vp_tree::result_queue results;
+	results.reserve(num_best_parts);
+	g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);
+
+	assert(results.get_size() == num_best_parts);
+
+	const auto& elements = results.get_elements();
+
+	// NOTE(review): entries are read starting at index 1 - presumably the queue's element array is 1-based; confirm against result_queue.
+	for (uint32_t i = 0; i < results.get_size(); i++)
+		pBest_parts[i] = elements[1 + i].m_pat_index;
+#endif
+
+	return true;
+}
+
+// Encodes a 6x6 block with three subsets, trying several 3-subset patterns and keeping the best one.
+// If num_est_patterns is non-zero, only the unique pattern indices in pEst_patterns are tried; otherwise all
+// NUM_UNIQUE_PARTITIONS3 patterns are tried.
+// For a full 6x6 weight grid, candidates are ranked by the summed per-subset encoder error. For smaller grids
+// the weights are downsampled, endpoints are optionally refined, and candidates are ranked by the error of the
+// decoded block.
+// On success res receives the best block and its error and the function returns true; otherwise res.m_valid
+// is false and false is returned.
+static bool encode_block_3_subsets(
+	trial_result& res,
+	uint32_t cem,
+	uint32_t grid_w, uint32_t grid_h,
+	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
+	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
+	astc_hdr_codec_base_options& coptions,
+	bool uber_mode_flag,
+	const int* pEst_patterns, int num_est_patterns,
+	uint32_t comp_level, 
+	opt_mode_t mode11_opt_mode)
+{
+	BASISU_NOTE_UNUSED(uber_mode_flag);
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
+	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);
+		
+	res.m_valid = false;
+		
+	double best_e = BIG_FLOAT_VAL;
+
+	astc_helpers::log_astc_block best_log_blk;
+	clear_obj(best_log_blk);
+
+	best_log_blk.m_num_partitions = NUM_SUBSETS;
+	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
+	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
+	best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
+	best_log_blk.m_grid_width = (uint8_t)grid_w;
+	best_log_blk.m_grid_height = (uint8_t)grid_h;
+
+	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
+	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
+
+	const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;
+
+	for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
+	{
+		const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
+		assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
+		const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];
+
+		// Gather each subset's texels (in block raster order) together with their block offsets.
+		vec4F part_pixels_q16[NUM_SUBSETS][64];
+		half_vec3 part_half_pixels[NUM_SUBSETS][64];
+		uint8_t part_pixel_index[NUM_SUBSETS][64];
+		uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };
+
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
+
+				uint32_t l = part_total_pixels[part_index];
+
+				part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
+				part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
+				part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
+
+				part_total_pixels[part_index] = l + 1;
+			} // x 
+		} // y
+
+		uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
+		uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
+		uint32_t best_submode[NUM_SUBSETS];
+
+		double e = 0.0f;
+		bool subset_failed = false;
+		for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
+		{
+			assert(part_total_pixels[part_iter]);
+
+			double part_err;
+			if (cem == 7)
+			{
+				part_err = encode_astc_hdr_block_mode_7(
+					part_total_pixels[part_iter],
+					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
+					best_log_blk.m_weight_ise_range,
+					best_submode[part_iter],
+					BIG_FLOAT_VAL,
+					blk_endpoints[part_iter],
+					blk_weights[part_iter],
+					coptions,
+					best_log_blk.m_endpoint_ise_range);
+			}
+			else
+			{
+				assert(cem == 11);
+
+				part_err = encode_astc_hdr_block_mode_11(
+					part_total_pixels[part_iter],
+					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
+					best_log_blk.m_weight_ise_range,
+					best_submode[part_iter],
+					BIG_FLOAT_VAL,
+					blk_endpoints[part_iter],
+					blk_weights[part_iter],
+					coptions,
+					false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, 
+					FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
+			}
+
+			// BIG_FLOAT_VAL is treated as "encode failed", as in encode_block_2_subsets(). The endpoint and
+			// weight buffers may not have been written in that case, so this pattern must not be used.
+			if (part_err == BIG_FLOAT_VAL)
+			{
+				subset_failed = true;
+				break;
+			}
+
+			e += part_err;
+
+		} // part_iter
+
+		if (subset_failed)
+			continue;
+
+		// Scatter the per-subset weights back into block raster order.
+		uint8_t ise_weights[BLOCK_W * BLOCK_H];
+
+		uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
+
+				ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
+				src_pixel_index[part_index]++;
+			} // x
+		} // y
+
+		if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
+		{
+			// Full resolution grid: the summed subset errors rank the candidate directly.
+			if (e < best_e)
+			{
+				best_e = e;
+				best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
+
+				for (uint32_t p = 0; p < NUM_SUBSETS; p++)
+					memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
+				
+				memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
+			}
+		}
+		else
+		{
+			// Smaller grid: dequantize, downsample to the grid, then requantize to ISE symbols.
+			uint8_t desired_weights[BLOCK_H * BLOCK_W];
+
+			const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
+
+			for (uint32_t by = 0; by < BLOCK_H; by++)
+				for (uint32_t bx = 0; bx < BLOCK_W; bx++)
+					desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
+
+			uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
+
+			const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
+			if (!pDownsample_matrix)
+			{
+				assert(0);
+				return false;
+			}
+
+			downsample_weight_grid(
+				pDownsample_matrix,
+				BLOCK_W, BLOCK_H,		// source/from dimension (block size)
+				grid_w, grid_h,			// dest/to dimension (grid size)
+				desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
+				downsampled_weights);	// [wy][wx]
+
+			astc_helpers::log_astc_block trial_blk(best_log_blk);
+
+			trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
+			
+			for (uint32_t p = 0; p < NUM_SUBSETS; p++)
+				memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
+
+			const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
+
+			for (uint32_t gy = 0; gy < grid_h; gy++)
+				for (uint32_t gx = 0; gx < grid_w; gx++)
+					trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
+
+			// Re-fit each subset's endpoints to the downsampled weights.
+			if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
+			{
+				for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
+				{
+					bool refine_status = refine_endpoints(
+						cem,
+						endpoints_ise_range,
+						trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
+						BLOCK_W, BLOCK_H, // block dimensions
+						grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
+						part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
+						&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
+						coptions, mode11_opt_mode);
+
+					BASISU_NOTE_UNUSED(refine_status);
+				}
+			}
+
+			// The per-subset errors no longer apply after downsampling, so measure the decoded block instead.
+			half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
+			bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
+			assert(status);
+			if (!status)
+				return false;
+
+			half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
+			for (uint32_t y = 0; y < BLOCK_H; y++)
+				for (uint32_t x = 0; x < BLOCK_W; x++)
+					decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);
+
+			double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
+			if (trial_err < best_e)
+			{
+				best_e = trial_err;
+				best_log_blk = trial_blk;
+			}
+		}
+
+	} // unique_p_iter
+
+	if (best_e < BIG_FLOAT_VAL)
+	{
+		res.m_log_blk = best_log_blk;
+		res.m_valid = true;
+		res.m_err = best_e;
+	}
+	else
+	{
+		res.m_valid = false;
+	}
+
+	return res.m_valid;
+}
+
+// Writes total_values values from pVals to 'coder' using the bit/trit/quint layout that
+// astc_helpers::g_ise_range_table gives for endpoint_range.
+// Each value is split into its low plain bits and a remaining trit or quint digit. Digits are packed in
+// groups (5 trits into 8 bits, or 3 quints into 7 bits); a trailing partial group uses fewer bits.
+// All packed digit groups are written first, followed by the plain bits of every value.
+// Returns the total number of bits written.
+static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
+{
+	const uint32_t MAX_VALS = 64;
+	assert((total_values) && (total_values <= MAX_VALS));
+
+	const uint32_t num_plain_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
+	const bool has_trits = astc_helpers::g_ise_range_table[endpoint_range][1] != 0;
+	const bool has_quints = astc_helpers::g_ise_range_table[endpoint_range][2] != 0;
+
+	// Trits take priority over quints; a radix of 0 means the range uses plain bits only.
+	const uint32_t radix = has_trits ? 3 : (has_quints ? 5 : 0);
+	// Multiplier reached once a group is full: 3^5 for trits, 5^3 for quints.
+	const uint32_t full_group_mul = has_trits ? 243 : 125;
+
+	uint32_t low_bits[MAX_VALS], packed_groups[(MAX_VALS + 2) / 3];
+	uint32_t num_packed_groups = 0, group_accum = 0, group_mul = 1;
+
+	for (uint32_t i = 0; i < total_values; i++)
+	{
+		const uint32_t v = pVals[i];
+
+		low_bits[i] = v & ((1 << num_plain_bits) - 1);
+
+		if (!radix)
+			continue;
+
+		const uint32_t digit = v >> num_plain_bits;
+		assert(digit < radix);
+
+		group_accum += digit * group_mul;
+		group_mul *= radix;
+
+		if (group_mul == full_group_mul)
+		{
+			assert(num_packed_groups < BASISU_ARRAY_SIZE(packed_groups));
+			packed_groups[num_packed_groups++] = group_accum;
+			group_accum = 0;
+			group_mul = 1;
+		}
+	}
+
+	uint32_t total_bits_output = 0;
+
+	// Complete digit groups.
+	const uint32_t full_group_bits = has_trits ? 8 : 7;
+	for (uint32_t i = 0; i < num_packed_groups; i++)
+	{
+		coder.put_bits(packed_groups[i], full_group_bits);
+		total_bits_output += full_group_bits;
+	}
+
+	// Trailing partial digit group, if any.
+	if (group_mul > 1)
+	{
+		uint32_t tail_bits;
+		if (has_trits)
+		{
+			switch (group_mul)
+			{
+			case 3:  tail_bits = 2; break;
+			case 9:  tail_bits = 4; break;
+			case 27: tail_bits = 5; break;
+			default: tail_bits = 7; break; // 81
+			}
+		}
+		else
+		{
+			tail_bits = (group_mul == 5) ? 3 : 5; // 5 or 25
+		}
+
+		coder.put_bits(group_accum, tail_bits);
+		total_bits_output += tail_bits;
+	}
+
+	// Plain bits of every value.
+	for (uint32_t i = 0; i < total_values; i++)
+	{
+		coder.put_bits(low_bits[i], num_plain_bits);
+		total_bits_output += num_plain_bits;
+	}
+
+	return total_bits_output;
+}
+
+// Returns the number of endpoint values stored per partition for color endpoint mode cem.
+// Only CEM 7 and CEM 11 are expected (checked by assert); any value other than 11 yields the mode 7 count.
+static inline uint32_t get_num_endpoint_vals(uint32_t cem)
+{
+	assert((cem == 7) || (cem == 11));
+
+	if (cem == 11)
+		return basist::NUM_MODE11_ENDPOINTS;
+
+	return basist::NUM_MODE7_ENDPOINTS;
+}
+
+// Serializes one block to coder.
+//
+// Layout written:
+//   - block mode index and endpoint mode, both as truncated binary codes
+//   - endpoint payload, depending on em:
+//       cUseLeftDelta / cUseUpperDelta: one NUM_ENDPOINT_DELTA_BITS-bit value per endpoint, taken from pEP_deltas
+//       cRaw: for 2 or 3 partitions the unique partition index, followed by all endpoint values
+//       any other mode: nothing
+//   - the weight grid (both planes when dual plane is enabled)
+//
+// pEP_deltas is only read for the two delta endpoint modes.
+static void code_block(bitwise_coder& coder,
+	const astc_helpers::log_astc_block& log_blk,
+	block_mode block_mode_index,
+	endpoint_mode em, const uint8_t *pEP_deltas)
+{
+	coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
+	coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);
+
+	const uint32_t vals_per_partition = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);
+
+	switch (em)
+	{
+	case endpoint_mode::cUseLeftDelta:
+	case endpoint_mode::cUseUpperDelta:
+	{
+		// Delta-coded endpoints are only supported for single partition blocks.
+		assert(log_blk.m_num_partitions == 1);
+
+		for (uint32_t v = 0; v < vals_per_partition; v++)
+			coder.put_bits(pEP_deltas[v], NUM_ENDPOINT_DELTA_BITS);
+
+		break;
+	}
+	case endpoint_mode::cRaw:
+	{
+		// Multi-partition blocks send the index into the table of unique partition patterns, not the raw seed.
+		switch (log_blk.m_num_partitions)
+		{
+		case 2:
+		{
+			const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
+			assert(unique_partition_index != -1);
+
+			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
+			break;
+		}
+		case 3:
+		{
+			const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
+			assert(unique_partition_index != -1);
+
+			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
+			break;
+		}
+		default:
+			break;
+		}
+
+		encode_values(coder, vals_per_partition * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
+		break;
+	}
+	default:
+		// Remaining endpoint modes carry no endpoint payload.
+		break;
+	}
+
+	const uint32_t num_planes = log_blk.m_dual_plane ? 2 : 1;
+	encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * num_planes, log_blk.m_weights, log_blk.m_weight_ise_range);
+}
+
+// Tuning parameters for create_smooth_maps2().
+//
+// For each of three neighborhood sizes, create_smooth_maps2() divides the local per-channel max
+// standard deviation by the m_max_*_std_dev threshold, clamps the ratio to [0,1], and uses it to lerp
+// from the matching m_*_max_mse_scale towards the scale produced by the previous stage.
+// NOTE(review): this presumes lerp(a, b, t) returns a at t == 0, i.e. the flattest areas receive the max scale - confirm.
+struct smooth_map_params
+{
+	// When true, create_smooth_maps2() sets every scale to 1.0 and returns early.
+	bool m_no_mse_scaling = false;
+
+	// Smallest neighborhood (pixel offsets -1..1, i.e. 3x3).
+	float m_max_smooth_std_dev = 100.0f;
+	float m_smooth_max_mse_scale = 13000.0f;
+
+	// Medium neighborhood (pixel offsets -3..2 as implemented, i.e. 6x6).
+	float m_max_med_smooth_std_dev = 9.0f;
+	float m_med_smooth_max_mse_scale = 15000.0f;
+
+	// Largest neighborhood (pixel offsets -5..4 as implemented, i.e. 10x10).
+	float m_max_ultra_smooth_std_dev = 4.0f;
+	// Earlier values tried for this scale: 4500, 10000, 50000, 100000, 400000, 800000.
+	float m_ultra_smooth_max_mse_scale = 2000000.0f;
+
+	// When true, create_smooth_maps2() builds visualization images and saves them as PNG files.
+	bool m_debug_images = true;
+
+	smooth_map_params() = default;
+
+	// Restores every field to its default value.
+	void clear()
+	{
+		*this = smooth_map_params();
+	}
+};
+
+// Resampler contributor lists, indexed by weight grid dimension. Only entries 1-6 are filled; entry 0 is unused.
+Resampler::Contrib_List* g_contrib_lists[7]; // 1-6
+
+// Builds g_contrib_lists[1..6]. Each list is created with 6 for both leading size arguments of
+// Resampler::make_clist() and a filter scale of 6 / grid dimension.
+// NOTE(review): gaussian_filter is paired with BASISU_BELL_FILTER_SUPPORT here (a variant using
+// BASISU_GAUSSIAN_FILTER_SUPPORT was tried previously) - presumably intentional, confirm.
+// NOTE(review): existing entries are overwritten without being released, so this looks intended to run once - confirm.
+static void init_contrib_lists()
+{
+	const uint32_t MAX_GRID_DIM = 6;
+
+	for (uint32_t grid_dim = 1; grid_dim <= MAX_GRID_DIM; grid_dim++)
+	{
+		const float filter_scale = 6.0f / (float)grid_dim;
+
+		g_contrib_lists[grid_dim] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, filter_scale, 0.0f);
+	}
+}
+
+#if 0 // Disabled variant of filter_block() that writes half-float and qlog16 outputs; kept for reference only, never compiled.
+static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
+{
+	vec3F temp_block[6][6]; // [y][x]
+
+	// first filter rows to temp_block
+	if (grid_x == 6)
+	{
+		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
+	}
+	else
+	{
+		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
+
+		for (uint32_t y = 0; y < 6; y++)
+		{
+			for (uint32_t x = 0; x < 6; x++)
+			{
+				vec3F p(0.0f);
+
+				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
+					p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight;
+
+				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
+
+				temp_block[y][x] = p;
+			} // x
+		} // y
+	}
+
+	// filter columns
+	if (grid_y == 6)
+	{
+		for (uint32_t y = 0; y < 6; y++)
+		{
+			for (uint32_t x = 0; x < 6; x++)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					const basist::half_float h = basist::float_to_half(temp_block[y][x][c]);
+					
+					pDst_block_half3[x + y * 6][c] = h;
+					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
+				}
+
+				pDst_block_q16[x + y * 6][3] = 0.0f;
+			} // x
+		} // y
+	}
+	else
+	{
+		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
+
+		for (uint32_t x = 0; x < 6; x++)
+		{
+			for (uint32_t y = 0; y < 6; y++)
+			{
+				vec3F p(0.0f);
+
+				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
+					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
+				
+				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
+				
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					const basist::half_float h = basist::float_to_half(p[c]);
+
+					pDst_block_half3[x + y * 6][c] = h;
+					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
+				}
+
+				pDst_block_q16[x + y * 6][3] = 0.0f;
+				
+			} // y
+		} // x
+	}
+}
+#endif
+
+// Separably filters a 6x6 block (row-major, index = x + y * 6) using the contributor lists in
+// g_contrib_lists[grid_x] (horizontal pass) and g_contrib_lists[grid_y] (vertical pass).
+// A grid dimension of 6 skips the corresponding pass. Only the first three components are filtered,
+// and filtered values are clamped to [0, basist::ASTC_HDR_MAX_VAL] after each pass that actually runs.
+// Note: when grid_y == 6 only components 0-2 of each destination pixel are written.
+static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
+{
+	vec4F row_filtered[6][6]; // [y][x]
+
+	// Horizontal pass: pSrc_block -> row_filtered
+	if (grid_x != 6)
+	{
+		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
+
+		for (uint32_t row = 0; row < 6; row++)
+		{
+			for (uint32_t col = 0; col < 6; col++)
+			{
+				const Resampler::Contrib_List& taps = pRow_lists[col];
+
+				vec3F sum(0.0f);
+				for (uint32_t t = 0; t < taps.n; t++)
+					sum += vec3F(pSrc_block[row * 6 + taps.p[t].pixel]) * taps.p[t].weight;
+
+				sum.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
+
+				row_filtered[row][col] = sum;
+			} // col
+		} // row
+	}
+	else
+	{
+		// Full horizontal resolution: nothing to filter.
+		memcpy(row_filtered, pSrc_block, sizeof(vec4F) * 6 * 6);
+	}
+
+	// Vertical pass: row_filtered -> pDst_block
+	if (grid_y != 6)
+	{
+		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
+
+		for (uint32_t col = 0; col < 6; col++)
+		{
+			for (uint32_t row = 0; row < 6; row++)
+			{
+				const Resampler::Contrib_List& taps = pCol_lists[row];
+
+				vec3F sum(0.0f);
+				for (uint32_t t = 0; t < taps.n; t++)
+					sum += row_filtered[taps.p[t].pixel][col] * taps.p[t].weight;
+
+				sum.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
+
+				pDst_block[col + row * 6] = sum;
+			} // row
+		} // col
+	}
+	else
+	{
+		// Full vertical resolution: copy the RGB components through.
+		for (uint32_t row = 0; row < 6; row++)
+		{
+			for (uint32_t col = 0; col < 6; col++)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					pDst_block[col + row * 6][c] = row_filtered[row][col][c];
+			} // col
+		} // row
+	}
+}
+
+// vec3F variant of filter_block(): separably filters a 6x6 block (row-major, index = x + y * 6) using
+// g_contrib_lists[grid_x] horizontally and g_contrib_lists[grid_y] vertically.
+// A grid dimension of 6 skips the corresponding pass. Unlike the vec4F overload, no clamping is applied.
+static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
+{
+	vec3F row_filtered[6][6]; // [y][x]
+
+	// Horizontal pass: pSrc_block -> row_filtered
+	if (grid_x != 6)
+	{
+		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
+
+		for (uint32_t row = 0; row < 6; row++)
+		{
+			for (uint32_t col = 0; col < 6; col++)
+			{
+				const Resampler::Contrib_List& taps = pRow_lists[col];
+
+				vec3F sum(0.0f);
+				for (uint32_t t = 0; t < taps.n; t++)
+					sum += vec3F(pSrc_block[row * 6 + taps.p[t].pixel]) * taps.p[t].weight;
+
+				row_filtered[row][col] = sum;
+			} // col
+		} // row
+	}
+	else
+	{
+		memcpy(row_filtered, pSrc_block, sizeof(vec3F) * 6 * 6);
+	}
+
+	// Vertical pass: row_filtered -> pDst_block
+	if (grid_y != 6)
+	{
+		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
+
+		for (uint32_t col = 0; col < 6; col++)
+		{
+			for (uint32_t row = 0; row < 6; row++)
+			{
+				const Resampler::Contrib_List& taps = pCol_lists[row];
+
+				// Accumulate directly into the destination pixel.
+				vec3F& dst = pDst_block[col + row * 6];
+				dst.set(0.0f);
+
+				for (uint32_t t = 0; t < taps.n; t++)
+					dst += row_filtered[taps.p[t].pixel][col] * taps.p[t].weight;
+			} // row
+		} // col
+	}
+	else
+	{
+		memcpy(pDst_block, row_filtered, sizeof(vec3F) * 6 * 6);
+	}
+}
+
+// Returns the mean, over the 36 pixels of two 6x6 blocks, of the summed squared differences
+// of the first three components. The fourth component is ignored.
+static float diff_blocks(const vec4F* pA, const vec4F* pB)
+{
+	const uint32_t NUM_PIXELS = 36;
+
+	float sum_sq = 0.0f;
+
+	for (uint32_t p = 0; p < NUM_PIXELS; p++)
+	{
+		const vec4F& a = pA[p];
+		const vec4F& b = pB[p];
+
+		sum_sq += square(a[0] - b[0]) + square(a[1] - b[1]) + square(a[2] - b[2]);
+	}
+
+	return sum_sq * (1.0f / (float)NUM_PIXELS);
+}
+
+// Computes the per-channel population standard deviation of (pA[i] - pB[i]) over 36 pixels
+// and returns the largest of the three channel values.
+static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
+{
+	const uint32_t NUM_PIXELS = 36;
+	const float inv_num_pixels = 1.0f / (float)NUM_PIXELS;
+
+	// Pass 1: mean of the differences.
+	vec3F avg_delta(0.0f);
+	for (uint32_t p = 0; p < NUM_PIXELS; p++)
+	{
+		vec3F delta(pA[p] - pB[p]);
+		avg_delta += delta;
+	}
+	avg_delta *= inv_num_pixels;
+
+	// Pass 2: sum of squared deviations from that mean.
+	vec3F sq_dev_sum(0.0f);
+	for (uint32_t p = 0; p < NUM_PIXELS; p++)
+	{
+		vec3F dev(pA[p] - pB[p]);
+		dev -= avg_delta;
+		sq_dev_sum += vec3F::component_mul(dev, dev);
+	}
+
+	vec3F variance(sq_dev_sum * inv_num_pixels);
+
+	const float sd0 = sqrtf(variance[0]);
+	const float sd1 = sqrtf(variance[1]);
+	const float sd2 = sqrtf(variance[2]);
+
+	return maximum(sd0, sd1, sd2);
+}
+
+// Builds a per-pixel MSE scale map for orig_img.
+//
+// For every pixel, the max per-channel (R,G,B) standard deviation is measured over three clamped
+// neighborhoods of increasing size (pixel offsets -1..1, -3..2 and -5..4). Each measurement is
+// normalized by the matching threshold in params, clamped to [0,1], and used to lerp from that
+// stage's max MSE scale towards the scale computed by the previous stage (the first stage lerps towards 1.0).
+//
+// smooth_block_mse_scales: always resized to the image dimensions. Filled with 1.0 when
+//                          params.m_no_mse_scaling is set, otherwise with the computed scales.
+// params:                  thresholds/scales; when m_debug_images is set, visualization PNGs
+//                          (dbg_smooth_vis.png, dbg_med_smooth_vis.png, dbg_ultra_smooth_vis.png, scale_vis.png) are written.
+// pUltra_smooth_img:       optional; receives the largest-neighborhood visualization image.
+//                          NOTE(review): that image is only populated when params.m_debug_images is set
+//                          (otherwise an unsized image is assigned), and it is left untouched on the
+//                          m_no_mse_scaling early-out - confirm this is what callers expect.
+static void create_smooth_maps2(
+	vector2D<float>& smooth_block_mse_scales,
+	const image& orig_img,
+	smooth_map_params& params, image* pUltra_smooth_img = nullptr)
+{
+	const uint32_t width = orig_img.get_width();
+	const uint32_t height = orig_img.get_height();
+
+	// Size the map before the early-out below, so the no-scaling path also yields a
+	// width x height map of 1.0 instead of leaving an unsized map untouched.
+	smooth_block_mse_scales.resize(width, height);
+
+	if (params.m_no_mse_scaling)
+	{
+		smooth_block_mse_scales.set_all(1.0f);
+		return;
+	}
+
+	image smooth_vis, med_smooth_vis, ultra_smooth_vis;
+
+	if (params.m_debug_images)
+	{
+		smooth_vis.resize(width, height);
+		med_smooth_vis.resize(width, height);
+		ultra_smooth_vis.resize(width, height);
+	}
+
+	// Max over R,G,B of the standard deviation inside the (clamped) window
+	// [px + first_ofs, px + last_ofs] x [py + first_ofs, py + last_ofs].
+	const auto window_max_std_dev = [&orig_img](uint32_t px, uint32_t py, int first_ofs, int last_ofs) -> float
+	{
+		const uint32_t num_comps = 3;
+		tracked_stat_dbl comp_stats[num_comps];
+
+		for (int yd = first_ofs; yd <= last_ofs; yd++)
+		{
+			for (int xd = first_ofs; xd <= last_ofs; xd++)
+			{
+				const color_rgba& p = orig_img.get_clamped((int)px + xd, (int)py + yd);
+
+				comp_stats[0].update((float)p[0]);
+				comp_stats[1].update((float)p[1]);
+				comp_stats[2].update((float)p[2]);
+			}
+		}
+
+		float max_std_dev = 0.0f;
+		for (uint32_t i = 0; i < num_comps; i++)
+			max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
+
+		return max_std_dev;
+	};
+
+	for (uint32_t y = 0; y < height; y++)
+	{
+		for (uint32_t x = 0; x < width; x++)
+		{
+			// Stage 1: offsets -1..1 (3x3)
+			{
+				const float max_std_dev = window_max_std_dev(x, y, -1, 1);
+
+				float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
+				yl = powf(yl, 1.0f / 2.0f); // substantially less bits
+
+				smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);
+
+				if (params.m_debug_images)
+				{
+					// white=high local activity (edges/detail)
+					// black=low local activity (smooth - error is amplified)
+					smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
+				}
+			}
+
+			// Stage 2: offsets -3..2 (6x6)
+			{
+				const float max_std_dev = window_max_std_dev(x, y, -3, 2);
+
+				const float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
+
+				smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
+
+				if (params.m_debug_images)
+					med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
+			}
+
+			// Stage 3: offsets -5..4 (10x10)
+			{
+				const float max_std_dev = window_max_std_dev(x, y, -5, 4);
+
+				float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
+				yl = powf(yl, 2.0f);
+
+				smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
+
+				if (params.m_debug_images)
+					ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
+			}
+		}
+	}
+
+	if (params.m_debug_images)
+	{
+		save_png("dbg_smooth_vis.png", smooth_vis);
+		save_png("dbg_med_smooth_vis.png", med_smooth_vis);
+		save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);
+
+		// Final scale map, normalized by its largest value for display.
+		image vis_img(width, height);
+
+		float max_scale = 0.0f;
+		for (uint32_t y = 0; y < height; y++)
+			for (uint32_t x = 0; x < width; x++)
+				max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));
+
+		for (uint32_t y = 0; y < height; y++)
+			for (uint32_t x = 0; x < width; x++)
+				vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));
+
+		save_png("scale_vis.png", vis_img);
+	}
+
+	if (pUltra_smooth_img)
+		*pUltra_smooth_img = ultra_smooth_vis;
+}
+
+const float REALLY_DARK_I_THRESHOLD = 0.0625f;
+const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;
+const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;
+
+static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)
+{
+	float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];
+	float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];
+	float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];
+		
+	float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);
+
+	if (delta_itp_dark_adjustment)
+	{
+		// We have to process a large range of inputs, including extremely dark inputs. 
+		// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.
+		// This is to better handle very dark signals which could be explictly overexposed.
+		float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);
+		s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);
+		err *= s;
+	}
+
+	return err;
+}
+
+// Returns the average of compute_pixel_mse_itp() over a block_w x block_h block.
+// Both pixel arrays are row-major with a stride of block_w.
+static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)
+{
+	const uint32_t num_pixels = block_w * block_h;
+
+	float err_sum = 0.0f;
+
+	// A flat index visits the pixels in the same order as a y-outer/x-inner scan.
+	for (uint32_t i = 0; i < num_pixels; i++)
+		err_sum += compute_pixel_mse_itp(pOrig_pixels_itp[i], pComp_pixels_itp[i], delta_itp_dark_adjustment);
+
+	return err_sum * (1.0f / (float)(num_pixels));
+}
+
+// Computes an SSIM-style similarity between two ITP blocks of up to 36 pixels.
+// For each of the three channels, luminance, contrast and structure terms are each saturated,
+// then combined as lum^1.5 * con * str. The result is the product of the three channel values.
+static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
+{
+	const uint32_t n = block_w * block_h;
+	assert(n <= 36);
+
+	const double d = 1.0f, k1 = .01f, k2 = .03f;
+
+	const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
+	const double s_c3(s_c2 * .5f);
+
+	// weight mean error more highly to reduce blocking
+	const float lum_exp = 1.5f, con_exp = 1.0f, str_exp = 1.0f;
+
+	float ssim[3];
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		// Channel c is read from each vec3F with a stride of 3 floats.
+		stats<float> xs, ys;
+		xs.calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
+		ys.calc_simplified(n, &pComp_pixels_itp[0][c], 3);
+
+		comparative_stats<float> cov;
+		cov.calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &xs, &ys);
+
+		float lum = (float)((2.0f * xs.m_avg * ys.m_avg + s_c1) / (square(xs.m_avg) + square(ys.m_avg) + s_c1));
+		lum = saturate(lum);
+
+		float con = (float)((2.0f * xs.m_std_dev * ys.m_std_dev + s_c2) / (xs.m_var + ys.m_var + s_c2));
+		con = saturate(con);
+
+		float str = (float)((cov.m_cov + s_c3) / (xs.m_std_dev * ys.m_std_dev + s_c3));
+		str = saturate(str);
+
+		ssim[c] = powf(lum, lum_exp) * powf(con, con_exp) * powf(str, str_exp);
+	}
+
+	// Channels are combined by multiplication. Alternatives that were tried:
+	// a .4/.3/.3 weighted sum, and ssim[0] * powf((ssim[1] + ssim[2]) * .5f, .75f).
+	const float final_ssim = ssim[0] * ssim[1] * ssim[2];
+
+	return final_ssim;
+}
+
+// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light
+//
+// Returns 720 * the Euclidean distance between ITP pixels a and b.
+// When delta_itp_dark_adjustment is set, the result is multiplied by a factor blended between
+// REALLY_DARK_DELTA_ITP_JND_SCALE and 1.0 via smoothstep on orig's I component.
+static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)
+{
+	const float delta_i = a[0] - b[0];
+	const float delta_t = a[1] - b[1];
+	const float delta_p = a[2] - b[2];
+
+	float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));
+
+	if (delta_itp_dark_adjustment)
+	{
+		// This is to better handle very dark signals which could be explicitly overexposed.
+		// The blend factor is only needed (and therefore only computed) on this path.
+		const float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);
+		err *= lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);
+	}
+
+	return err;
+}
+
+// One candidate way of coding a block, together with its coded bits and decoded pixels.
+// Copy and move operations transfer every member, so a copy is indistinguishable from its source.
+struct candidate_encoding
+{
+	encoding_type m_encoding_type;
+
+	basist::half_float m_solid_color[3];
+
+	uint32_t m_run_len;
+
+	vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
+	vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
+
+	endpoint_mode m_endpoint_mode;
+	block_mode m_block_mode;
+
+	bitwise_coder m_coder;
+
+	// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.
+	// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.
+	astc_helpers::log_astc_block m_coded_log_blk;
+
+	// The block the decoder outputs.
+	astc_helpers::log_astc_block m_decomp_log_blk;
+
+	int m_reuse_delta_index;
+
+	float m_t, m_d, m_bits;
+
+	candidate_encoding()
+	{
+		clear();
+	}
+
+	candidate_encoding(const candidate_encoding &other)
+	{
+		*this = other;
+	}
+
+	candidate_encoding(candidate_encoding&& other)
+	{
+		*this = std::move(other);
+	}
+
+	candidate_encoding& operator=(const candidate_encoding& rhs)
+	{
+		if (this == &rhs)
+			return *this;
+
+		copy_plain_members(rhs);
+		m_coder = rhs.m_coder;
+
+		return *this;
+	}
+
+	candidate_encoding& operator=(candidate_encoding&& rhs)
+	{
+		if (this == &rhs)
+			return *this;
+
+		copy_plain_members(rhs);
+		// The coder is the only member that is moved rather than copied.
+		m_coder = std::move(rhs.m_coder);
+
+		return *this;
+	}
+
+	// Resets every member to its empty/invalid state.
+	void clear()
+	{
+		m_encoding_type = encoding_type::cInvalid;
+
+		clear_obj(m_solid_color);
+
+		m_run_len = 0;
+
+		clear_obj(m_comp_pixels);
+		clear_obj(m_comp_pixels_itp);
+
+		m_endpoint_mode = endpoint_mode::cInvalid;
+		m_block_mode = block_mode::cInvalid;
+
+		m_coder.restart();
+
+		m_coded_log_blk.clear();
+		m_decomp_log_blk.clear();
+
+		m_t = 0;
+		m_d = 0;
+		m_bits = 0;
+
+		m_reuse_delta_index = 0;
+	}
+
+private:
+	// Copies every member except m_coder, which the copy/move assignment operators handle themselves.
+	// Includes m_comp_pixels_itp, m_t, m_d and m_bits: leaving them out would give copy/move-constructed
+	// objects indeterminate values for those members.
+	void copy_plain_members(const candidate_encoding& rhs)
+	{
+		m_encoding_type = rhs.m_encoding_type;
+		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
+		m_run_len = rhs.m_run_len;
+		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
+		memcpy(m_comp_pixels_itp, rhs.m_comp_pixels_itp, sizeof(m_comp_pixels_itp));
+		m_endpoint_mode = rhs.m_endpoint_mode;
+		m_block_mode = rhs.m_block_mode;
+		m_coded_log_blk = rhs.m_coded_log_blk;
+		m_decomp_log_blk = rhs.m_decomp_log_blk;
+		m_reuse_delta_index = rhs.m_reuse_delta_index;
+		m_t = rhs.m_t;
+		m_d = rhs.m_d;
+		m_bits = rhs.m_bits;
+	}
+};
+
+// Decodes log_blk in HDR16 mode and writes the RGB components, converted from half to float,
+// to pPixels (row-major, block_w * block_h entries). The fourth decoded component is discarded.
+// block_w and block_h must each be <= 6 (assert only). Returns false if the block fails to decode.
+bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
+{
+	assert((block_w <= 6) && (block_h <= 6));
+
+	half_vec4 texels_half4[6 * 6]; // row-major, stride block_w
+	const bool decoded_ok = astc_helpers::decode_block(log_blk, texels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);
+	assert(decoded_ok);
+
+	if (!decoded_ok)
+		return false;
+
+	// Source and destination share the same layout, so a flat index is enough.
+	const uint32_t total_texels = block_w * block_h;
+	for (uint32_t i = 0; i < total_texels; i++)
+	{
+		pPixels[i].set(
+			basist::half_to_float(texels_half4[i][0]),
+			basist::half_to_float(texels_half4[i][1]),
+			basist::half_to_float(texels_half4[i][2]));
+	}
+
+	return true;
+}
+
+// Returns true if decomp_blk can be packed into a physical ASTC block. The packed result is discarded.
+static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)
+{
+	astc_helpers::astc_block scratch_phys_blk;
+
+	const bool packed_ok = astc_helpers::pack_astc_block(scratch_phys_blk, decomp_blk);
+	return packed_ok;
+}
+
+#define SYNC_MARKERS (0)
+
+static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)
+{
+	interval_timer tm;
+	tm.start();
+
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6;
+
+	width = 0;
+	height = 0;
+
+	if (comp_data.size() <= 2*3)
+		return false;
+
+	basist::bitwise_decoder decoder;
+	if (!decoder.init(comp_data.data(), comp_data.size_u32()))
+		return false;
+
+	if (decoder.get_bits(16) != 0xABCD)
+		return false;
+
+	width = decoder.get_bits(16);
+	height = decoder.get_bits(16);
+		
+	if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))
+		return false;
+
+	const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;
+	const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;
+	const uint32_t total_blocks = num_blocks_x * num_blocks_y;
+
+	decoded_blocks.resize(num_blocks_x, num_blocks_y);
+	//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());
+
+	vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);
+	//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());
+
+	uint32_t cur_bx = 0, cur_by = 0;
+	uint32_t step_counter = 0;
+	BASISU_NOTE_UNUSED(step_counter);
+		
+	while (cur_by < num_blocks_y)
+	{
+		step_counter++;
+		
+		//if ((cur_bx == 9) && (cur_by == 13))
+		//	printf("!");
+
+#if SYNC_MARKERS
+		uint32_t mk = decoder.get_bits(16);
+		if (mk != 0xDEAD)
+		{
+			printf("!");
+			assert(0);
+			return false;
+		}
+#endif
+		if (decoder.get_bits_remaining() < 1)
+			return false;
+
+		encoding_type et = encoding_type::cBlock;
+
+		uint32_t b0 = decoder.get_bits(1);
+		if (!b0)
+		{
+			uint32_t b1 = decoder.get_bits(1);
+			if (b1)
+				et = encoding_type::cReuse;
+			else
+			{
+				uint32_t b2 = decoder.get_bits(1);
+				if (b2)
+					et = encoding_type::cSolid;
+				else
+					et = encoding_type::cRun;
+			}
+		}
+
+		switch (et)
+		{
+		case encoding_type::cRun:
+		{
+			if (!cur_bx && !cur_by)
+				return false;
+
+			const uint32_t run_len = decoder.decode_vlc(5) + 1;
+			
+			uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);
+			if (run_len > num_blocks_remaining)
+				return false;
+						
+			uint32_t prev_bx = cur_bx, prev_by = cur_by;
+
+			if (cur_bx)
+				prev_bx--;
+			else
+			{
+				prev_bx = num_blocks_x - 1;
+				prev_by--;
+			}
+
+			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
+			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
+
+			for (uint32_t i = 0; i < run_len; i++)
+			{
+				decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;
+				decoded_blocks(cur_bx, cur_by) = prev_phys_blk;
+
+				cur_bx++;
+				if (cur_bx == num_blocks_x)
+				{
+					cur_bx = 0;
+					cur_by++;
+				}
+			}
+
+			break;
+		}
+		case encoding_type::cSolid:
+		{
+			const basist::half_float rh = (basist::half_float)decoder.get_bits(15);
+			const basist::half_float gh = (basist::half_float)decoder.get_bits(15);
+			const basist::half_float bh = (basist::half_float)decoder.get_bits(15);
+
+			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
+
+			log_blk.clear();
+			log_blk.m_solid_color_flag_hdr = true;
+			log_blk.m_solid_color[0] = rh;
+			log_blk.m_solid_color[1] = gh;
+			log_blk.m_solid_color[2] = bh;
+			log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
+
+			bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);
+			if (!status)
+				return false;
+
+			cur_bx++;
+			if (cur_bx == num_blocks_x)
+			{
+				cur_bx = 0;
+				cur_by++;
+			}
+			
+			break;
+		}
+		case encoding_type::cReuse:
+		{
+			if (!cur_bx && !cur_by)
+				return false;
+
+			const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);
+
+			const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
+			const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
+
+			const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;
+			if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))
+				return false;
+			if (prev_by < 0)
+				return false;
+			
+			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
+			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
+
+			if (prev_log_blk.m_solid_color_flag_hdr)
+				return false;
+
+			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
+			astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
+			
+			log_blk = prev_log_blk;
+
+			const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);
+
+			bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);
+			if (!status)
+				return false;
+
+			astc_helpers::log_astc_block decomp_blk;
+			status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);
+			if (!status)
+				return false;
+			
+			uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+			basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);
+
+			copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);
+
+			status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
+			if (!status)
+				return false;
+
+			cur_bx++;
+			if (cur_bx == num_blocks_x)
+			{
+				cur_bx = 0;
+				cur_by++;
+			}
+
+			break;
+		}
+		case encoding_type::cBlock:
+		{
+			const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);
+			const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);
+
+			switch (em)
+			{
+			case endpoint_mode::cUseLeft:
+			case endpoint_mode::cUseUpper:
+			{
+				int neighbor_bx = cur_bx, neighbor_by = cur_by;
+				
+				if (em == endpoint_mode::cUseLeft)
+					neighbor_bx--;
+				else
+					neighbor_by--;
+
+				if ((neighbor_bx < 0) || (neighbor_by < 0))
+					return false;
+
+				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
+				if (!neighbor_blk.m_color_endpoint_modes[0])
+					return false;
+
+				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
+				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
+
+				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
+					return false;
+
+				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
+				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
+
+				log_blk.clear();
+				log_blk.m_num_partitions = 1;
+				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
+				log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;
+				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
+				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
+				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
+				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
+				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+
+				memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);
+
+				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
+
+				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
+				if (!status)
+					return false;
+
+				astc_helpers::log_astc_block decomp_blk;
+				decomp_blk.clear();
+
+				decomp_blk.m_num_partitions = 1;
+				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
+				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
+				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
+				decomp_blk.m_dual_plane = bmd.m_dp;
+				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+
+				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
+
+				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
+
+				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
+
+				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
+				if (!status)
+					return false;
+
+				cur_bx++;
+				if (cur_bx == num_blocks_x)
+				{
+					cur_bx = 0;
+					cur_by++;
+				}
+
+				break;
+			}
+			case endpoint_mode::cUseLeftDelta:
+			case endpoint_mode::cUseUpperDelta:
+			{
+				int neighbor_bx = cur_bx, neighbor_by = cur_by;
+
+				if (em == endpoint_mode::cUseLeftDelta)
+					neighbor_bx--;
+				else
+					neighbor_by--;
+
+				if ((neighbor_bx < 0) || (neighbor_by < 0))
+					return false;
+
+				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
+				if (!neighbor_blk.m_color_endpoint_modes[0])
+					return false;
+
+				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
+				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
+
+				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
+					return false;
+
+				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
+				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
+
+				log_blk.clear();
+				log_blk.m_num_partitions = 1;
+				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
+				log_blk.m_dual_plane = bmd.m_dp;
+				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+				
+				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
+				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
+
+				const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
+				const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
+
+				const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;
+				const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;
+				const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);
+
+				for (uint32_t i = 0; i < num_endpoint_values; i++)
+				{
+					int cur_val = ise_to_rank[log_blk.m_endpoints[i]];
+					
+					int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;
+
+					cur_val += delta;
+					if ((cur_val < 0) || (cur_val >= total_endpoint_levels))
+						return false;
+
+					log_blk.m_endpoints[i] = rank_to_ise[cur_val];
+				}
+
+				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
+				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
+				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
+
+				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
+
+				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
+				if (!status)
+					return false;
+
+				astc_helpers::log_astc_block decomp_blk;
+				decomp_blk.clear();
+
+				decomp_blk.m_num_partitions = 1;
+				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
+				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
+				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
+				decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;
+				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+
+				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
+
+				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
+
+				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
+
+				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
+				if (!status)
+					return false;
+
+				cur_bx++;
+				if (cur_bx == num_blocks_x)
+				{
+					cur_bx = 0;
+					cur_by++;
+				}
+
+				break;
+			}
+			case endpoint_mode::cRaw:
+			{
+				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
+
+				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
+
+				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
+				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
+
+				log_blk.clear();
+				log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
+				
+				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
+					log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
+
+				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
+				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
+
+				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
+				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
+				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
+				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+
+				if (bmd.m_num_partitions == 2)
+				{
+					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);
+					log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];
+				}
+				else if (bmd.m_num_partitions == 3)
+				{
+					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);
+					log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];
+				}
+				
+				bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
+				if (!status)
+					return false;
+
+				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
+
+				status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
+				if (!status)
+					return false;
+
+				astc_helpers::log_astc_block decomp_blk;
+				decomp_blk.clear();
+				
+				decomp_blk.m_dual_plane = bmd.m_dp;
+				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
+				decomp_blk.m_partition_id = log_blk.m_partition_id;
+								
+				decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
+				
+				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
+					decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
+
+				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
+				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
+
+				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
+					basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);
+
+				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
+
+				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
+
+				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
+				if (!status)
+					return false;
+
+				cur_bx++;
+				if (cur_bx == num_blocks_x)
+				{
+					cur_bx = 0;
+					cur_by++;
+				}
+
+				break;
+			}
+			default:
+			{
+				assert(0);
+				return false;
+			}
+			}
+
+			break;
+		}
+		default:
+		{
+			assert(0);
+			return false;
+		}
+		}
+	}
+
+	if (decoder.get_bits(16) != 0xA742)
+	{
+		fmt_error_printf("End marker not found!\n");
+		return false;
+	}
+
+	//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());
+
+	return true;
+}
+
+// Decodes one physical ASTC block into block_width * block_height float RGBA texels.
+// The block is unpacked to its logical form, decoded to half floats (cDecodeModeHDR16),
+// and each half is then widened to float.
+// pPixels must have room for block_width * block_height entries.
+// Returns false if either the unpack or the decode step fails.
+static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
+{
+	astc_helpers::log_astc_block logical_block;
+	const bool unpacked_ok = astc_helpers::unpack_block(pBlock, logical_block, block_width, block_height);
+	if (!unpacked_ok)
+		return false;
+
+	basist::half_float decoded_texels[MAX_BLOCK_W * MAX_BLOCK_H][4];
+	const bool decoded_ok = astc_helpers::decode_block(logical_block, decoded_texels, block_width, block_height, astc_helpers::cDecodeModeHDR16);
+	if (!decoded_ok)
+		return false;
+
+	const uint32_t num_texels = block_width * block_height;
+	for (uint32_t texel = 0; texel < num_texels; texel++)
+	{
+		for (uint32_t chan = 0; chan < 4; chan++)
+			pPixels[texel][chan] = basist::half_to_float(decoded_texels[texel][chan]);
+	}
+
+	return true;
+}
+
+// Alternate block decoder: forwards to basisu_astc::astc::decompress_hdr(), handing it
+// pPixels viewed as a flat float array and pBlock viewed as bytes. Returns that call's result.
+static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
+{
+	float* pDst_floats = (float*)pPixels;
+	uint8_t* pSrc_bytes = (uint8_t*)pBlock;
+
+	return basisu_astc::astc::decompress_hdr(pDst_floats, pSrc_bytes, block_width, block_height);
+}
+
+// Encodes src_img to BC6H, one 4x4 block at a time, storing the blocks in bc6h_blocks
+// (resized here to the image's 4x4 block dimensions).
+// Only the first three channels of each source texel are converted to half float and encoded.
+// If pPacked_bc6h_img is non-null it is resized to the source dimensions and filled with the
+// decoded result of every block (alpha forced to 1.0), via unpack_bc6h().
+// Returns false only if unpack_bc6h() fails on a freshly encoded block.
+static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)
+{
+	const uint32_t BC6H_DIM = 4, BC6H_TEXELS = BC6H_DIM * BC6H_DIM, NUM_CHANS = 3;
+
+	const uint32_t img_w = src_img.get_width();
+	const uint32_t img_h = src_img.get_height();
+
+	const bool want_unpacked = (pPacked_bc6h_img != nullptr);
+	if (want_unpacked)
+		pPacked_bc6h_img->resize(img_w, img_h);
+
+	// Accumulated encode time; only consumed by the commented-out report at the end.
+	interval_timer enc_timer;
+	double enc_secs = 0.0;
+
+	const uint32_t blocks_across = src_img.get_block_width(BC6H_DIM);
+	const uint32_t blocks_down = src_img.get_block_height(BC6H_DIM);
+
+	bc6h_blocks.resize(blocks_across, blocks_down);
+
+	for (uint32_t by = 0; by < blocks_down; by++)
+	{
+		for (uint32_t bx = 0; bx < blocks_across; bx++)
+		{
+			// Fetch the source texels in row-major order (index = x + y * 4).
+			vec4F src_texels[BC6H_TEXELS];
+			src_img.extract_block_clamped(src_texels, bx * BC6H_DIM, by * BC6H_DIM, BC6H_DIM, BC6H_DIM);
+
+			// Interleaved RGB half floats, same row-major texel order.
+			basist::half_float src_halfs[BC6H_TEXELS * NUM_CHANS];
+			for (uint32_t t = 0; t < BC6H_TEXELS; t++)
+			{
+				for (uint32_t c = 0; c < NUM_CHANS; c++)
+					src_halfs[t * NUM_CHANS + c] = basist::float_to_half(src_texels[t][c]);
+			}
+
+			basist::bc6h_block& dst_block = bc6h_blocks(bx, by);
+
+			enc_timer.start();
+
+			basist::astc_6x6_hdr::fast_encode_bc6h(src_halfs, &dst_block, enc_params);
+
+			enc_secs += enc_timer.get_elapsed_secs();
+
+			if (!want_unpacked)
+				continue;
+
+			// Decode the block we just wrote so the caller can inspect the packed result.
+			basist::half_float decoded_halfs[BC6H_TEXELS * NUM_CHANS];
+			const bool status = unpack_bc6h(&dst_block, decoded_halfs, false);
+			assert(status);
+			if (!status)
+			{
+				fmt_error_printf("unpack_bc6h() failed\n");
+				return false;
+			}
+
+			for (uint32_t t = 0; t < BC6H_TEXELS; t++)
+			{
+				vec4F texel;
+
+				for (uint32_t c = 0; c < NUM_CHANS; c++)
+					texel[c] = basist::half_to_float(decoded_halfs[t * NUM_CHANS + c]);
+
+				texel[3] = 1.0f;
+
+				// set_clipped() is used because edge blocks may extend past the image.
+				pPacked_bc6h_img->set_clipped(bx * BC6H_DIM + (t % BC6H_DIM), by * BC6H_DIM + (t / BC6H_DIM), texel);
+			}
+
+		} // bx
+	} // by
+
+	//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);
+
+	return true;
+}
+
+// Returns the squared length of (p - line_org) after removing its component along line_dir.
+// The component is removed with a single dot product, so this equals the squared
+// point-to-line distance only when line_dir is unit length.
+// NOTE(review): callers presumably pass a normalized direction - confirm.
+static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)
+{
+	vec3F rel(p - line_org);
+	vec3F perp(rel - rel.dot(line_dir) * line_dir);
+
+	return perp.dot(perp);
+}
+
+// Ranks candidate partition patterns for one 6x6 block and returns the best-scoring ones,
+// separately for "mode 11" and "mode 7" (presumably the ASTC color endpoint modes 11 and 7).
+//
+// For every examined pattern, and for each of its partitions, the function computes the mean
+// color, the 3x3 covariance (upper triangle) and the total variance (covariance trace). Then:
+//  - mode 11 score: sum over partitions of (total variance - variance along a principal axis
+//    estimated with a few power iterations), i.e. the squared error left after fitting a free line.
+//  - mode 7 score: the same, but using the fixed grayscale axis (1,1,1)/sqrt(3) instead.
+// Candidates are sorted by ascending score and the first num_desired_pats unique-pattern indices
+// are written to each output array.
+//
+// Preconditions: num_parts <= 3, 1 <= num_desired_pats <= 1024, num_pats_to_examine <= 1024.
+// NOTE(review): num_desired_pats should also not exceed num_pats_to_examine, otherwise the tail
+// of the output arrays is filled from unscored candidate slots - confirm against callers.
+static void estimate_partitions_mode7_and_11(
+	uint32_t num_parts, // 2 or 3 partitions
+	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
+	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
+	const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
+	const astc_hdr_codec_base_options& coptions, // options
+	uint32_t num_desired_pats, 
+	int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
+{
+	BASISU_NOTE_UNUSED(coptions);
+	BASISU_NOTE_UNUSED(num_unique_pats);
+
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6
+	assert(num_parts <= MAX_PARTS);
+
+	struct candidate_res
+	{
+		float m_total_sq_dist;
+		uint32_t m_index;
+		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
+	};
+
+	const uint32_t MAX_CANDIDATES = 1024;
+	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
+
+	// The candidate arrays below are fixed-size and indexed by examine_iter, so the number of
+	// examined patterns must never exceed their capacity. Assert in debug builds and clamp in
+	// release builds so an oversized request cannot write past the end of the stack arrays.
+	assert(num_pats_to_examine <= MAX_CANDIDATES);
+	const uint32_t num_to_examine = minimum<uint32_t>(num_pats_to_examine, MAX_CANDIDATES);
+
+	candidate_res mode11_candidates[MAX_CANDIDATES];
+	candidate_res mode7_candidates[MAX_CANDIDATES];
+
+	const vec3F grayscale_axis(0.5773502691f);
+	
+	for (uint32_t examine_iter = 0; examine_iter < num_to_examine; examine_iter++)
+	{
+		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
+		assert(unique_part_index < num_unique_pats);
+
+		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
+
+		// Pass 1: per-partition mean color.
+		vec3F part_means[MAX_PARTS];
+		uint32_t part_total_texels[MAX_PARTS] = { 0 };
+
+		for (uint32_t i = 0; i < num_parts; i++)
+			part_means[i].clear();
+
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				const uint32_t part_index = (*pPat)(x, y);
+				assert(part_index < num_parts);
+
+				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
+				part_total_texels[part_index]++;
+
+			} // x
+		} // y
+		
+		for (uint32_t i = 0; i < num_parts; i++)
+		{
+			assert(part_total_texels[i]);
+			part_means[i] /= (float)part_total_texels[i];
+		}
+
+		// Pass 2: per-partition covariance, stored as the upper triangle
+		// [rr, rg, rb, gg, gb, bb] (sums, not divided by the texel count).
+		float part_cov[MAX_PARTS][6];
+		memset(part_cov, 0, sizeof(part_cov));
+
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				const uint32_t part_index = (*pPat)(x, y);
+				assert(part_index < num_parts);
+
+				const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);
+
+				const float r = p[0], g = p[1], b = p[2];
+
+				part_cov[part_index][0] += r * r;
+				part_cov[part_index][1] += r * g;
+				part_cov[part_index][2] += r * b;
+				part_cov[part_index][3] += g * g;
+				part_cov[part_index][4] += g * b;
+				part_cov[part_index][5] += b * b;
+
+			} // x
+		} // y
+
+		// For each partition compute the total variance of all channels.
+		float total_variance[MAX_PARTS];
+		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
+			total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];
+
+		float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, the variance along the estimated principal axis
+		float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, the variance along the grayscale axis
+
+		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
+		{
+			float* pCov = &part_cov[part_index][0];
+
+			// Power iteration starting from an arbitrary vector; each step rescales by the
+			// largest absolute component to keep the values in range.
+			float xr = .9f, xg = 1.0f, xb = .7f;
+			
+			const uint32_t NUM_POWER_ITERS = 4;
+			for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
+			{
+				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
+				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
+				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
+
+				float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
+
+				if (m >= 1e-10f)
+				{
+					m = 1.0f / m;
+					
+					r *= m;
+					g *= m;
+					b *= m;
+				}
+
+				xr = r;
+				xg = g;
+				xb = b;
+			}
+
+			float len_sq = xr * xr + xg * xg + xb * xb;
+						
+			if (len_sq < 1e-10f)
+			{
+				// Degenerate (near-zero) covariance: fall back to the grayscale axis.
+				xr = grayscale_axis[0];
+				xg = grayscale_axis[0];
+				xb = grayscale_axis[0];
+			}
+			else
+			{
+				// Normalize (len_sq is reused to hold the reciprocal length).
+				len_sq = 1.0f / sqrtf(len_sq);
+
+				xr *= len_sq;
+				xg *= len_sq;
+				xb *= len_sq;
+			}
+			
+			{
+				// Transform the principal axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principal axis).
+				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
+				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
+				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
+
+				// Estimate the principal eigenvalue by projecting the transformed vector back onto the axis.
+				// The result is the variance along the principal axis.
+				//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principal axis
+				//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb
+				
+				mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
+			}
+
+			{
+				const float yrgb = grayscale_axis[0];
+				
+				// Transform the grayscale axis by the covariance matrix and project back onto it, giving the variance of the dataset projected onto this vector.
+				float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
+				float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
+				float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];
+
+				mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
+			}
+
+		} // part_index
+				
+		// Compute the total variance (squared error) of the other 2 axes by subtracting the variance along the principal axis from the total variance of all channels.
+		// This replaces an explicit per-texel distance-to-line sum.
+		// TODO: Could also compute the ratio of the principal axis's variance vs. the total variance.
+		float mode11_total_sq_dist_to_line_alt = 0.0f;
+		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
+		{
+			float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
+			mode11_total_sq_dist_to_line_alt += d;
+		}
+
+		mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
+		mode11_candidates[examine_iter].m_index = unique_part_index;
+
+		{
+			float mode7_total_sq_dist_to_line_alt = 0.0f;
+			for (uint32_t part_index = 0; part_index < num_parts; part_index++)
+			{
+				float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
+				mode7_total_sq_dist_to_line_alt += d;
+			}
+
+			mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
+			mode7_candidates[examine_iter].m_index = unique_part_index;
+		}
+
+	} // examine_iter
+
+	// Lowest residual error first.
+	std::sort(mode11_candidates, mode11_candidates + num_to_examine);
+	std::sort(mode7_candidates, mode7_candidates + num_to_examine);
+
+	for (uint32_t i = 0; i < num_desired_pats; i++)
+		pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;
+
+	for (uint32_t i = 0; i < num_desired_pats; i++)
+		pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
+}
+
+// Ranks candidate partition patterns for one 6x6 block for "mode 7" only.
+//
+// For every examined pattern the per-partition mean color is computed, then each texel's
+// squared distance to the line through its partition's mean along the fixed grayscale axis
+// (1,1,1)/sqrt(3) is summed. Candidates are sorted by ascending total and the first
+// num_desired_pats unique-pattern indices are written to pDesired_pat_indices.
+//
+// Preconditions: num_parts <= 3, 1 <= num_desired_pats <= 1024, num_pats_to_examine <= 1024.
+// NOTE(review): num_desired_pats should also not exceed num_pats_to_examine, otherwise the tail
+// of the output array is filled from unscored candidate slots - confirm against callers.
+static void estimate_partitions_mode7(
+	uint32_t num_parts, // 2 or 3 partitions
+	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
+	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
+	const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
+	const astc_hdr_codec_base_options& coptions, // options
+	uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
+{
+	BASISU_NOTE_UNUSED(coptions);
+	BASISU_NOTE_UNUSED(num_unique_pats);
+
+	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
+	assert(num_parts <= MAX_PARTS);
+
+	struct candidate_res
+	{
+		float m_total_sq_dist;
+		uint32_t m_index;
+		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
+	};
+
+	const uint32_t MAX_CANDIDATES = 1024;
+	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
+
+	// candidates[] is fixed-size and indexed by examine_iter, so the number of examined patterns
+	// must never exceed its capacity. Assert in debug builds and clamp in release builds so an
+	// oversized request cannot write past the end of the stack array.
+	assert(num_pats_to_examine <= MAX_CANDIDATES);
+	const uint32_t num_to_examine = minimum<uint32_t>(num_pats_to_examine, MAX_CANDIDATES);
+
+	candidate_res candidates[MAX_CANDIDATES];
+
+	for (uint32_t examine_iter = 0; examine_iter < num_to_examine; examine_iter++)
+	{
+		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
+		assert(unique_part_index < num_unique_pats);
+
+		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
+
+		// Per-partition mean color.
+		vec3F part_means[MAX_PARTS];
+		uint32_t part_total_texels[MAX_PARTS] = { 0 };
+
+		for (uint32_t i = 0; i < num_parts; i++)
+			part_means[i].clear();
+
+		for (uint32_t y = 0; y < BLOCK_H; y++)
+		{
+			for (uint32_t x = 0; x < BLOCK_W; x++)
+			{
+				const uint32_t part_index = (*pPat)(x, y);
+				assert(part_index < num_parts);
+
+				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
+				part_total_texels[part_index]++;
+
+			} // x
+		} // y
+
+		for (uint32_t i = 0; i < num_parts; i++)
+		{
+			assert(part_total_texels[i]);
+			part_means[i] /= (float)part_total_texels[i];
+		}
+
+		// Every partition is fitted along the same fixed grayscale direction.
+		vec3F part_axis(0.5773502691f);
+		
+		// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
+		// then compute the principal eigenvalue, and subtract. The result is the variance of the projection distances.
+		float total_sq_dist_to_line = 0.0f;
+		for (uint32_t i = 0; i < BLOCK_T; i++)
+		{
+			const uint32_t part_index = (*pPat)[i];
+			assert(part_index < num_parts);
+
+			total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
+		}
+
+		candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
+
+		candidates[examine_iter].m_index = unique_part_index;
+
+	} // examine_iter
+
+	// Lowest residual error first.
+	std::sort(candidates, candidates + num_to_examine);
+
+	for (uint32_t i = 0; i < num_desired_pats; i++)
+		pDesired_pat_indices[i] = candidates[i].m_index;
+}
+
+// Measures how much a candidate encoding changes the discontinuity across the four edges of
+// the 6x6 block at block coordinates (bx, by).
+//
+// For each edge texel, "o" is the source pixel just outside the block and "q" is the pixel just
+// inside it. Two mean squared differences are accumulated over all edge samples whose outside
+// pixel lies within the width x height image:
+//   orig: (source outside) vs (source inside)
+//   comp: (source outside) vs (candidate's compressed inside pixel)
+// Returns |comp - orig| / orig, or 0 when no sample was usable or orig is zero.
+static float calc_deblocking_penalty_itp(
+	uint32_t bx, uint32_t by, uint32_t width, uint32_t height,
+	const imagef& pass_src_img_itp, const candidate_encoding& candidate)
+{
+	const uint32_t NUM_EDGES = 4, EDGE_LEN = 6;
+
+	float orig_sq_err_sum = 0.0f, comp_sq_err_sum = 0.0f;
+	uint32_t num_samples = 0;
+
+	for (uint32_t edge = 0; edge < NUM_EDGES; edge++)
+	{
+		for (uint32_t i = 0; i < EDGE_LEN; i++)
+		{
+			int ox = 0, oy = 0, qx = 0, qy = 0;
+
+			// The unsigned expressions may wrap for bx/by == 0; converting to int then yields
+			// a negative coordinate, which the bounds test below rejects.
+			if (edge == 0)
+			{
+				// Top edge: row above the block vs. the block's first row.
+				ox = bx * 6 + i; oy = (by - 1) * 6 + 5;
+				qx = bx * 6 + i; qy = by * 6;
+			}
+			else if (edge == 1)
+			{
+				// Bottom edge: row below the block vs. the block's last row.
+				ox = bx * 6 + i; oy = (by + 1) * 6;
+				qx = bx * 6 + i; qy = by * 6 + 5;
+			}
+			else if (edge == 2)
+			{
+				// Left edge: column left of the block vs. the block's first column.
+				ox = (bx - 1) * 6 + 5; oy = by * 6 + i;
+				qx = bx * 6; qy = by * 6 + i;
+			}
+			else
+			{
+				// Right edge: column right of the block vs. the block's last column.
+				ox = (bx + 1) * 6; oy = by * 6 + i;
+				qx = bx * 6 + 5; qy = by * 6 + i;
+			}
+
+			const bool outside_in_image = (ox >= 0) && (oy >= 0) && (ox < (int)width) && (oy < (int)height);
+			if (!outside_in_image)
+				continue;
+
+			const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);
+			const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);
+
+			const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block
+			
+			vec3F src_step(o_pixel_itp - q_pixel_itp);
+			orig_sq_err_sum += square(src_step[0]) + square(src_step[1]) + square(src_step[2]);
+
+			vec3F comp_step(o_pixel_itp - d_pixel_itp);
+			comp_sq_err_sum += square(comp_step[0]) + square(comp_step[1]) + square(comp_step[2]);
+
+			num_samples++;
+		}
+	}
+
+	if (!num_samples)
+		return 0.0f;
+
+	orig_sq_err_sum /= (float)num_samples;
+	comp_sq_err_sum /= (float)num_samples;
+
+	// A perfectly flat source edge gives no baseline to compare against.
+	if (orig_sq_err_sum == 0.0f)
+		return 0.0f;
+
+	return fabsf((comp_sq_err_sum - orig_sq_err_sum) / orig_sq_err_sum);
+}
+
+// Decides how the image's ASTC 6x6 block rows are split into horizontal strips.
+//
+//  lambda           - when exactly 0 the strip count follows total_threads; otherwise it is
+//                     derived from num_blocks_y (about 12 block rows per strip, at least 8 strips
+//                     once there are 16+ block rows, at most 32).
+//  num_blocks_y     - number of block rows in the image; must be non-zero.
+//  total_threads    - requested strip count for the lambda == 0 case.
+//  force_one_strip  - keeps the whole image in a single strip.
+//  res_total_strips / res_rows_per_strip - outputs, written only on success.
+//  global_cfg       - only m_debug_output is read here (enables diagnostic printing).
+//
+// With more than one strip, rows_per_strip is forced to an even number (minimum 2) and the strip
+// count is then recomputed as ceil(num_blocks_y / rows_per_strip); the final strip takes whatever
+// rows remain. Returns false if num_blocks_y is 0 or the per-strip row counts do not add up.
+static bool calc_strip_size(
+	float lambda,
+	uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,
+	uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)
+{
+	// An image with no block rows cannot be partitioned. Without this guard the single-strip
+	// path below would set rows_per_strip to 0 and then divide by it.
+	if (!num_blocks_y)
+	{
+		fmt_error_printf("Strip calc failed\n");
+		return false;
+	}
+
+	uint32_t total_strips = 1;
+
+	if (lambda == 0.0f)
+	{
+		if (!force_one_strip)
+		{
+			total_strips = total_threads;
+		}
+	}
+	else
+	{
+		const uint32_t MIN_DESIRED_STRIPS = 8;
+		const uint32_t MAX_TARGET_STRIPS = 32;
+		const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;
+
+		if (!force_one_strip)
+		{
+			total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);
+
+			if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)
+				total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);
+		}
+
+		total_strips = minimum(total_strips, MAX_TARGET_STRIPS);
+	}
+
+	uint32_t rows_per_strip = 0;
+	if (total_strips <= 1)
+	{
+		// Also covers total_threads == 0 in the lambda == 0 case.
+		rows_per_strip = num_blocks_y;
+	}
+	else
+	{
+		// Round down to an even number of block rows per strip.
+		rows_per_strip = (num_blocks_y / total_strips) & ~1;
+		
+		if (rows_per_strip < 2)
+			rows_per_strip = 2;// num_blocks_y;
+	}
+		
+	assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));
+
+	// Recompute the strip count from the final rows_per_strip (ceiling division).
+	total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;
+	
+	if (global_cfg.m_debug_output)
+	{
+		fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);
+		fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);
+		fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);
+	}
+
+	// Sanity check: walk the strips and confirm they cover every block row exactly once.
+	uint32_t total_rows = 0;
+	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
+	{
+		uint32_t strip_first_by = strip_index * rows_per_strip;
+		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
+
+		// The final strip absorbs any leftover rows.
+		if (strip_index == (total_strips - 1))
+			strip_last_by = num_blocks_y - 1;
+
+		uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;
+		total_rows += num_strip_block_rows;
+
+		if (global_cfg.m_debug_output)
+			fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);
+	}
+
+	if (total_rows != num_blocks_y)
+	{
+		fmt_error_printf("Strip calc failed\n");
+		return false;
+	}
+
+	res_total_strips = total_strips;
+	res_rows_per_strip = rows_per_strip;
+
+	return true;
+}
+
+// Resizes dst_img to match src_img and fills it, pixel by pixel, with the result of
+// linear_rgb_to_itp() applied to the first three channels of each source pixel.
+static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)
+{
+	const uint32_t img_w = src_img.get_width();
+	const uint32_t img_h = src_img.get_height();
+
+	dst_img.resize(img_w, img_h);
+
+	for (uint32_t row = 0; row < img_h; row++)
+	{
+		for (uint32_t col = 0; col < img_w; col++)
+		{
+			vec3F rgb(src_img(col, row));
+
+			vec3F itp;
+			linear_rgb_to_itp(rgb, itp, cfg);
+
+			dst_img(col, row) = itp;
+		}
+	}
+}
+
+// Block dimensions and the resulting pixel count per block.
+const uint32_t BLOCK_W = 6;
+const uint32_t BLOCK_H = 6;
+const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;
+
+// Encoder tuning constants. Their use sites are outside this part of the file;
+// NOTE(review): the groupings below are inferred from the names only.
+
+// Penalties, presumably per encoding type.
+const float SOLID_PENALTY = 4.0f;
+const float REUSE_PENALTY = 1.0f;
+const float RUN_PENALTY = 10.0f;
+
+// Error-metric weights and thresholds.
+const float MSE_WEIGHT = 300000.0f;
+const float SSIM_WEIGHT = 200.0f;
+const float TWO_LEVEL_PENALTY = 1.425f;
+const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;
+const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;
+
+// MSE penalties for small weight grids on complex blocks.
+const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
+const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
+const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;
+
+// Statistics and visualization state gathered while encoding: plain histograms, per-block-mode
+// stat vectors, atomic counters, visualization images and mutexes.
+// init() sizes the images and zeroes all counters; print() reports the collected numbers.
+// NOTE(review): the mutexes presumably guard the images and the non-atomic histograms; their
+// use is outside this struct.
+struct uastc_hdr_6x6_debug_state
+{
+	// Non-atomic histograms, indexed by the corresponding enum value.
+	uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
+	uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
+	uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
+	uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };
+
+	// Per block mode: three parallel stat vectors (print() labels the comparative ones RG, RB, GB).
+	basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
+	basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];
+
+	std::atomic<uint32_t> m_total_gaussian1_blocks;
+	std::atomic<uint32_t> m_total_gaussian2_blocks;
+	std::atomic<uint32_t> m_total_filter_horizontal;
+	std::atomic<uint32_t> m_detail_stats[5];
+	std::atomic<uint32_t> m_total_mode7_skips;
+
+	std::atomic<uint32_t> m_total_blocks_compressed;
+
+	std::atomic<uint32_t> m_total_candidates_considered;
+	std::atomic<uint32_t> m_max_candidates_considered;
+
+	std::atomic<uint32_t> m_total_part2_stats[4];
+	std::atomic<uint32_t> m_dp_stats[5];
+
+	std::atomic<uint32_t> m_reuse_num_parts[4];
+	std::atomic<uint32_t> m_reuse_total_dp;
+
+	imagef m_stat_vis;
+	std::mutex m_stat_vis_mutex;
+
+	image m_part_vis;
+	image m_mode_vis;
+	image m_mode_vis2;
+	image m_grid_vis;
+	image m_enc_vis;
+	std::mutex m_vis_image_mutex;
+
+	std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];
+		
+	std::atomic<uint32_t> m_total_jnd_replacements;
+
+	std::mutex m_stats_mutex;
+
+	// Pre-reserves room in every per-mode stat vector. The atomic counters are not
+	// initialized here; init() zeroes them.
+	uastc_hdr_6x6_debug_state()
+	{
+		for (auto& per_mode : m_block_mode_comp_stats)
+			for (auto& stats_vec : per_mode)
+				stats_vec.reserve(512);
+
+		for (auto& per_mode : m_block_mode_comparative_stats)
+			for (auto& stats_vec : per_mode)
+				stats_vec.reserve(512);
+	}
+	
+	// Sizes every visualization image to width x height and resets all statistics to zero.
+	void init(uint32_t width, uint32_t height)
+	{
+		m_stat_vis.resize(width, height);
+		m_part_vis.resize(width, height);
+		m_mode_vis.resize(width, height);
+		m_mode_vis2.resize(width, height);
+		m_grid_vis.resize(width, height);
+		m_enc_vis.resize(width, height);
+
+		basisu::clear_obj(m_encoding_type_hist);
+		basisu::clear_obj(m_endpoint_mode_hist);
+		basisu::clear_obj(m_block_mode_hist);
+		basisu::clear_obj(m_block_mode_total_bits);
+		
+		for (auto& per_mode : m_block_mode_comp_stats)
+			for (auto& stats_vec : per_mode)
+				stats_vec.clear();
+
+		for (auto& per_mode : m_block_mode_comparative_stats)
+			for (auto& stats_vec : per_mode)
+				stats_vec.clear();
+
+		m_total_gaussian1_blocks.store(0);
+		m_total_gaussian2_blocks.store(0);
+		m_total_filter_horizontal.store(0);
+		for (auto& counter : m_detail_stats)
+			counter.store(0);
+		m_total_mode7_skips.store(0);
+
+		for (auto& counter : m_comp_level_hist)
+			counter.store(0);
+
+		m_total_blocks_compressed.store(0);
+
+		m_total_candidates_considered.store(0);
+		m_max_candidates_considered.store(0);
+
+		for (auto& counter : m_total_part2_stats)
+			counter.store(0);
+		
+		for (auto& counter : m_dp_stats)
+			counter.store(0);
+
+		for (auto& counter : m_reuse_num_parts)
+			counter.store(0);
+
+		m_reuse_total_dp.store(0);
+
+		m_total_jnd_replacements.store(0);
+	}
+
+	// Prints all collected statistics. total_blocks is the denominator for percentages
+	// and per-block averages.
+	void print(uint32_t total_blocks) const
+	{
+		// Share of total_blocks, in percent.
+		const auto pct_of_blocks = [total_blocks](uint32_t count) -> float
+		{
+			return (float)count * 100.0f / (float)total_blocks;
+		};
+
+		fmt_printf("Total blocks: {}\n", total_blocks);
+		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, pct_of_blocks(m_total_jnd_replacements));
+		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
+		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, pct_of_blocks(m_total_gaussian1_blocks));
+		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, pct_of_blocks(m_total_gaussian2_blocks));
+		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, pct_of_blocks(m_total_filter_horizontal));
+		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
+		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);
+
+		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
+		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);
+
+		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
+		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
+		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
+		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);
+
+		fmt_printf("\nEncoding type histogram:\n");
+		for (uint32_t type_index = 0; type_index < std::size(m_encoding_type_hist); type_index++)
+			fmt_printf("{}: {}\n", type_index, m_encoding_type_hist[type_index]);
+
+		fmt_printf("\nEndpoint mode histogram:\n");
+		for (uint32_t mode_index = 0; mode_index < std::size(m_endpoint_mode_hist); mode_index++)
+			fmt_printf("{}: {}\n", mode_index, m_endpoint_mode_hist[mode_index]);
+
+		fmt_printf("\nBlock mode histogram:\n");
+
+		// Totals accumulated across all block modes while printing the per-mode lines.
+		uint32_t total_dp = 0, total_sp = 0;
+		uint32_t total_mode11 = 0, total_mode7 = 0;
+		uint32_t part_hist[3] = { 0 };
+		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
+		uint32_t total_used_modes = 0;
+		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
+		{
+			const auto& bm_desc = g_block_mode_descs[i];
+
+			const uint32_t total_uses = m_block_mode_hist[i];
+			const bool is_mode7 = (bm_desc.m_cem == 7);
+
+			if (bm_desc.m_dp)
+				total_dp += total_uses;
+			else
+				total_sp += total_uses;
+
+			// Anything that is not CEM 7 is counted as mode 11.
+			if (is_mode7)
+				total_mode7 += total_uses;
+			else
+				total_mode11 += total_uses;
+
+			part_hist[bm_desc.m_num_partitions - 1] += total_uses;
+
+			if (bm_desc.m_num_partitions == 2)
+			{
+				if (is_mode7)
+					part2_mode7_total += total_uses;
+				else
+				{
+					assert(bm_desc.m_cem == 11);
+					part2_mode11_total += total_uses;
+				}
+			}
+
+			float avg_std_dev = 0.0f;
+			float avg_cross_correlations[3] = { 0 };
+
+			if (m_block_mode_comp_stats[i][0].size())
+			{
+				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();
+
+				// Average, over recorded blocks, of the largest of the three std devs.
+				for (uint32_t j = 0; j < num_uses; j++)
+					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
+				avg_std_dev /= (float)num_uses;
+
+				// Average absolute Pearson correlation for each of the three comparative stats.
+				for (uint32_t j = 0; j < num_uses; j++)
+				{
+					for (uint32_t k = 0; k < 3; k++)
+						avg_cross_correlations[k] += fabsf((float)m_block_mode_comparative_stats[i][k][j].m_pearson);
+				}
+
+				for (uint32_t k = 0; k < 3; k++)
+					avg_cross_correlations[k] /= (float)num_uses;
+			}
+
+			const double avg_bits = total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f;
+
+			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
+				bm_desc.m_cem,
+				bm_desc.m_dp, bm_desc.m_dp_channel,
+				bm_desc.m_num_partitions,
+				bm_desc.m_grid_x, bm_desc.m_grid_y,
+				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
+				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
+				avg_bits,
+				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);
+
+			if (total_uses)
+				total_used_modes++;
+		}
+
+		fmt_printf("Total used modes: {}\n", total_used_modes);
+
+		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
+		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
+		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
+		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
+	}
+};
+
+struct uastc_hdr_6x6_encode_state // Encoder working state passed by reference to compress_strip_task(); strip_bits is indexed per strip, so presumably one instance serves all strip tasks (confirm at the call site).
+{
+	astc_hdr_codec_base_options master_coptions; // Baseline codec options: compress_strip_task() copies these for every block and then adjusts the copy according to the block's chosen comp level.
+		
+	imagef src_img; // Source image; blocks are extracted from it on outer pass 0.
+		
+	imagef src_img_filtered1; // Filtered source used on outer pass 1 (the use site describes it as the Gaussian filtered version).
+	imagef src_img_filtered2; // Filtered source used on outer pass 2 (presumably a stronger filter than filtered1 -- TODO confirm where it is built).
+
+	imagef src_img_itp; // Pass-0 companion of src_img; its blocks are copied into block_pixels_as_itp ("itp space", used for error calculations).
+	imagef src_img_filtered1_itp; // Same companion image for outer pass 1.
+	imagef src_img_filtered2_itp; // Same companion image for outer pass 2.
+
+	vector2D<float> smooth_block_mse_scales; // NOTE(review): not referenced in the visible part of compress_strip_task(); presumably one MSE scale factor per block -- verify.
+
+	imagef packed_img; // NOTE(review): use not visible here; presumably receives the decoded (packed) result for inspection/metrics -- verify.
+
+	basisu::vector<bitwise_coder> strip_bits; // One bit coder per strip; compress_strip_task() takes strip_bits[strip_index] as its output coder.
+
+	basisu::vector2D<astc_helpers::astc_block> final_astc_blocks; // NOTE(review): use not visible here; presumably the final physical ASTC blocks, one per block position -- verify.
+
+	vector2D<candidate_encoding> coded_blocks; // Addressed as (bx, by); the reuse-candidate path reads earlier blocks' encodings from here.
+};
+
+static bool compress_strip_task(
+	uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
+	uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
+	astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
+{
+	BASISU_NOTE_UNUSED(num_blocks_y);
+	BASISU_NOTE_UNUSED(total_strips);
+	
+	vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
+	basisu::clear_obj(prev_comp_pixels);
+
+	uint32_t prev_run_len = 0;
+
+	bitwise_coder prev_encoding;
+	candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
+	candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written
+
+	bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];
+
+	const uint32_t CANDIDATES_TO_RESERVE = 1536;
+
+	basisu::vector<candidate_encoding> candidates;
+	candidates.reserve(CANDIDATES_TO_RESERVE);
+
+	for (uint32_t by = strip_first_by; by <= strip_last_by; by++)
+	{
+		const bool has_upper_neighbor = by > strip_first_by;
+
+		for (uint32_t bx = 0; bx < num_blocks_x; bx++)
+		{
+			//if ((bx == 1) && (by == 2))
+			//	basisu::fmt_printf("!");
+
+			for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)
+			{
+				const bool has_left_neighbor = bx > 0;
+				//const bool has_prev = has_left_neighbor || has_upper_neighbor;
+
+				// Select either the original source image, or the Gaussian filtered version.
+				// From here the encoder *must* use these 2 sources.
+				const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :
+					((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);
+
+				const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :
+					((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);
+
+				// Extract source image block
+				vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]
+				pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
+
+				vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]
+				pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
+
+				half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values
+				vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats
+				vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding
+				vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations
+
+				bool is_grayscale = true;
+
+				candidates.resize(0);
+
+				float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;
+
+				for (uint32_t y = 0; y < BLOCK_H; y++)
+				{
+					for (uint32_t x = 0; x < BLOCK_W; x++)
+					{
+						vec3F rgb_input;
+
+						for (uint32_t c = 0; c < 3; c++)
+						{
+							float v = block_pixels[y][x][c];
+
+							rgb_input[c] = v;
+
+							const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);
+							assert(h == basist::float_to_half(v));
+
+							half_pixels[y][x][c] = h;
+
+							block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);
+
+							half_pixels_as_floats[y][x][c] = (float)h;
+
+						} // c
+
+						float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));
+						if (py < block_ly)
+							block_ly = py;
+						if (py > block_hy)
+							block_hy = py;
+						block_avg_y += py;
+
+						//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);
+
+						block_pixels_as_itp[y][x] = block_pixels_itp[y][x];
+
+						block_pixels_q16[y][x][3] = 0.0f;
+
+						if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))
+							is_grayscale = false;
+
+					} // x
+				} // y
+
+				block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);
+
+				encode_astc_block_stats enc_block_stats;
+				enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);
+
+				vec4F x_filtered[6][6], y_filtered[6][6];
+
+				filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)
+				filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)
+
+				const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);
+				const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);
+				const bool filter_horizontally = filtered_x_err < filtered_y_err;
+
+				//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);
+
+				if (filter_horizontally)
+					debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);
+
+				vec3F lowpass_filtered[6][6];
+				filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);
+				float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);
+
+				const bool very_detailed_block = lowpass_std_dev > 350.0f;
+				const bool very_blurry_block = lowpass_std_dev < 30.0f;
+				const bool super_blurry_block = lowpass_std_dev < 15.0f;
+
+				basisu::stats<float> half_comp_stats[3];
+				for (uint32_t c = 0; c < 3; c++)
+					half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);
+
+				const float SINGLE_PART_HALF_THRESH = 256.0f;
+				const float COMPLEX_HALF_THRESH = 1024.0f;
+				// HACK HACK
+				const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;
+
+				const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);
+
+				const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);
+				const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);
+				const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);
+
+				// Dynamically choose a comp_level for this block.
+				astc_hdr_codec_base_options coptions(enc_state.master_coptions);
+				uint32_t comp_level = global_cfg.m_master_comp_level;
+
+				if (very_complex_block)
+					comp_level = global_cfg.m_highest_comp_level;
+				else if (complex_block)
+					comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;
+
+				debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);
+
+				bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;
+				BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);
+
+				for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
+				{
+					if (comp_level == 0)
+					{
+						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
+							continue;
+					}
+					else if (comp_level == 1)
+					{
+						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
+							continue;
+					}
+					else if (comp_level == 2)
+					{
+						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
+							continue;
+					}
+
+					if (g_block_mode_descs[i].m_num_partitions == 2)
+					{
+						any_2subset_enabled = true;
+
+						if (g_block_mode_descs[i].m_cem == 7)
+						{
+							any_2subset_mode7_enabled = true;
+						}
+						else
+						{
+							assert(g_block_mode_descs[i].m_cem == 11);
+							any_2subset_mode11_enabled = true;
+						}
+					}
+					else if (g_block_mode_descs[i].m_num_partitions == 3)
+						any_3subset_enabled = true;
+				}
+
+				coptions.m_mode7_full_s_optimization = (comp_level >= 2);
+
+				const bool uber_mode_flag = (comp_level >= 3);
+				coptions.m_allow_uber_mode = uber_mode_flag;
+
+				coptions.m_ultra_quant = (comp_level >= 4);
+
+				coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);
+				coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);
+
+				coptions.m_disable_weight_plane_optimization = (comp_level >= 2);
+
+				// -------------------
+
+				uint32_t total_used_block_chans = 0;
+				for (uint32_t i = 0; i < 3; i++)
+					total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);
+
+				const bool is_solid_block = (total_used_block_chans == 0);
+
+				basisu::comparative_stats<float> half_cross_chan_stats[3];
+
+				// R vs. G
+				half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,
+					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],
+					3, 3,
+					&half_comp_stats[0], &half_comp_stats[1]);
+
+				// R vs. B
+				half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,
+					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],
+					3, 3,
+					&half_comp_stats[0], &half_comp_stats[2]);
+
+				// G vs. B
+				half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,
+					&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],
+					3, 3,
+					&half_comp_stats[1], &half_comp_stats[2]);
+
+				const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);
+				const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);
+				const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);
+
+				float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					if (half_comp_stats[i].m_range > 0.0f)
+					{
+						const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);
+						min_corr = minimum(min_corr, c);
+						max_corr = maximum(max_corr, c);
+					}
+				}
+
+				bool use_single_subset_mode7 = true;
+				if (comp_level <= 1)
+				{
+					// TODO: could also compute angle between principle axis and the grayscale axis.
+					// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance
+					const float MODE7_MIN_CHAN_CORR = .5f;
+					const float MODE7_PCA_ANGLE_THRESH = .9f;
+					use_single_subset_mode7 = is_grayscale || is_solid_block || (min_corr >= MODE7_MIN_CHAN_CORR);
+
+					if (use_single_subset_mode7)
+					{
+						float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));
+						if (cos_ang < MODE7_PCA_ANGLE_THRESH)
+							use_single_subset_mode7 = false;
+					}
+				}
+
+				const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);
+
+				int desired_dp_chan = -1;
+				if (total_used_block_chans <= 1)
+				{
+					// no need for dual plane (except possibly 2x2 weight grids for RDO)
+				}
+				else
+				{
+					if (min_corr >= STRONG_CORR_THRESH)
+					{
+						// all channel pairs strongly correlated, no need for dual plane
+						debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);
+					}
+					else
+					{
+						if (total_used_block_chans == 2)
+						{
+							if (half_comp_stats[0].m_range == 0.0f)
+							{
+								// r unused, check for strong gb correlation
+								if (gb_corr < STRONG_CORR_THRESH)
+									desired_dp_chan = 1;
+							}
+							else if (half_comp_stats[1].m_range == 0.0f)
+							{
+								// g unused, check for strong rb correlation
+								if (rb_corr < STRONG_CORR_THRESH)
+									desired_dp_chan = 0;
+							}
+							else
+							{
+								// b unused, check for strong rg correlation
+								if (rg_corr < STRONG_CORR_THRESH)
+									desired_dp_chan = 0;
+							}
+						}
+						else
+						{
+							assert(total_used_block_chans == 3);
+
+							// see if rg/rb is weakly correlated vs. gb
+							if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
+								desired_dp_chan = 0;
+							// see if gr/gb is weakly correlated vs. rb
+							else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
+								desired_dp_chan = 1;
+							// assume b is weakest
+							else
+								desired_dp_chan = 2;
+						}
+
+						if (desired_dp_chan == -1)
+							debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);
+						else
+							debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);
+					}
+				}
+
+				// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.
+				int desired_dp_chan_2x2 = 0;
+				if (total_used_block_chans == 2)
+				{
+					if (half_comp_stats[0].m_range == 0.0f)
+						desired_dp_chan_2x2 = 1;
+				}
+				else if (total_used_block_chans == 3)
+				{
+					// see if rg/rb is weakly correlated vs. gb
+					if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
+						desired_dp_chan_2x2 = 0;
+					// see if gr/gb is weakly correlated vs. rb
+					else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
+						desired_dp_chan_2x2 = 1;
+					// assume b is weakest
+					else
+						desired_dp_chan_2x2 = 2;
+				}
+
+				// Gather all candidate encodings
+				bool status = false;
+
+				// ---- Run candidate
+				if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))
+				{
+					candidate_encoding candidate;
+					candidate.m_coder.reserve(24);
+
+					candidate.m_encoding_type = encoding_type::cRun;
+
+					candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;
+					candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;
+
+					memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));
+
+					if (!prev_run_len)
+					{
+						candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
+						candidate.m_coder.put_vlc(0, 5);
+					}
+					else
+					{
+						// extend current run - compute the # of new bits needed for the extension.
+
+						uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
+						assert(prev_run_bits > 0);
+
+						// We're not actually going to code this, because the previously emitted run code will be extended.
+						bitwise_coder temp_coder;
+						temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
+						temp_coder.put_vlc((prev_run_len + 1) - 1, 5);
+
+						uint32_t cur_run_bits = temp_coder.get_total_bits_u32();
+						assert(cur_run_bits >= prev_run_bits);
+
+						uint32_t total_new_bits = cur_run_bits - prev_run_bits;
+						if (total_new_bits > 0)
+							candidate.m_coder.put_bits(0, total_new_bits); // dummy bits
+					}
+
+					candidate.m_run_len = prev_run_len + 1;
+
+					candidates.emplace_back(std::move(candidate));
+				}
+
+				// ---- Reuse candidate
+				if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))
+				{
+					for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)
+					{
+						const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
+						const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
+
+						const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;
+						if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))
+							continue;
+						if (reuse_by < (int)strip_first_by)
+							break;
+
+						const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);
+
+						// TODO - support this.
+						if (prev_candidate.m_encoding_type == encoding_type::cSolid)
+							continue;
+						assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));
+
+						candidate_encoding candidate;
+						candidate.m_coder.reserve(24);
+						astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+						astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;
+
+						const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;
+
+						const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;
+						const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;
+						const uint32_t num_grid_samples = grid_x * grid_y;
+						const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);
+
+						coded_log_blk = prev_candidate.m_coded_log_blk;
+						decomp_log_blk = prev_candidate.m_decomp_log_blk;
+
+						if (prev_coded_log_blk.m_num_partitions == 1)
+						{
+							// Now encode the block using the transcoded endpoints
+							basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+							if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
+							{
+								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
+									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
+							}
+							else
+							{
+								status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
+									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
+							}
+							assert(status);
+
+							uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
+							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+
+							if (dual_plane)
+							{
+								eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,
+									BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+								downsample_ise_weights_dual_plane(
+									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
+									BLOCK_W, BLOCK_H,
+									grid_x, grid_y,
+									trial_weights0, trial_weights1, coded_log_blk.m_weights);
+
+								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
+							}
+							else
+							{
+								eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+								downsample_ise_weights(
+									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
+									BLOCK_W, BLOCK_H,
+									grid_x, grid_y,
+									trial_weights0, coded_log_blk.m_weights);
+
+								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
+							}
+
+							// Create the block the decoder would transcode into.
+							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
+						}
+						else if (prev_coded_log_blk.m_num_partitions == 2)
+						{
+							assert(!dual_plane);
+
+							const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];
+							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));
+
+							const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];
+
+							vec4F part_pixels_q16[2][64];
+							half_vec3 part_half_pixels[2][64];
+							uint32_t part_total_pixels[2] = { 0 };
+
+							for (uint32_t y = 0; y < BLOCK_H; y++)
+							{
+								for (uint32_t x = 0; x < BLOCK_W; x++)
+								{
+									const uint32_t part_index = pat_vec[x + y * 6];
+
+									uint32_t l = part_total_pixels[part_index];
+
+									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
+									part_half_pixels[part_index][l] = half_pixels[y][x];
+
+									part_total_pixels[part_index] = l + 1;
+								} // x 
+							} // y
+
+							uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
+
+							for (uint32_t part_index = 0; part_index < 2; part_index++)
+							{
+								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+								if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
+								{
+									status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
+										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
+								}
+								else
+								{
+									status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
+										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
+								}
+								assert(status);
+
+								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
+									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+							} // part_index
+
+							uint8_t ise_weights[BLOCK_W * BLOCK_H];
+
+							uint32_t src_pixel_index[2] = { 0, 0 };
+							for (uint32_t y = 0; y < BLOCK_H; y++)
+							{
+								for (uint32_t x = 0; x < BLOCK_W; x++)
+								{
+									const uint32_t part_index = pat_vec[x + y * 6];
+
+									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
+									src_pixel_index[part_index]++;
+								} // x
+							} // y
+
+							downsample_ise_weights(
+								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
+								BLOCK_W, BLOCK_H,
+								grid_x, grid_y,
+								ise_weights, coded_log_blk.m_weights);
+
+							// Transcode these codable weights to ASTC weights.
+							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
+							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
+
+							// Create the block the decoder would transcode into.
+							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
+						}
+						else if (prev_coded_log_blk.m_num_partitions == 3)
+						{
+							assert(!dual_plane);
+
+							const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];
+							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));
+
+							const partition_pattern_vec& pat = g_partitions3[unique_pat_index];
+
+							vec4F part_pixels_q16[3][64];
+							half_vec3 part_half_pixels[3][64];
+							uint32_t part_total_pixels[3] = { 0 };
+
+							for (uint32_t y = 0; y < BLOCK_H; y++)
+							{
+								for (uint32_t x = 0; x < BLOCK_W; x++)
+								{
+									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
+
+									uint32_t l = part_total_pixels[part_index];
+
+									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
+									part_half_pixels[part_index][l] = half_pixels[y][x];
+
+									part_total_pixels[part_index] = l + 1;
+								} // x 
+							} // y
+
+							uint8_t blk_weights[3][BLOCK_W * BLOCK_H];
+
+							for (uint32_t part_index = 0; part_index < 3; part_index++)
+							{
+								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
+									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
+								assert(status);
+
+								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
+									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+							} // part_index
+
+							uint8_t ise_weights[BLOCK_W * BLOCK_H];
+
+							uint32_t src_pixel_index[3] = { 0 };
+							for (uint32_t y = 0; y < BLOCK_H; y++)
+							{
+								for (uint32_t x = 0; x < BLOCK_W; x++)
+								{
+									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
+
+									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
+									src_pixel_index[part_index]++;
+								} // x
+							} // y
+
+							downsample_ise_weights(
+								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
+								BLOCK_W, BLOCK_H,
+								grid_x, grid_y,
+								ise_weights, coded_log_blk.m_weights);
+
+							// Transcode these codable weights to ASTC weights.
+							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
+							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
+
+							// Create the block the decoder would transcode into.
+							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
+						}
+
+						if (!validate_log_blk(decomp_log_blk))
+						{
+							fmt_error_printf("pack_astc_block() failed\n");
+							return false;
+						}
+
+						status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);
+						if (!status)
+						{
+							fmt_error_printf("decode_astc_block() failed\n");
+							return false;
+						}
+
+						candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);
+						candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);
+						encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);
+
+						candidate.m_encoding_type = encoding_type::cReuse;
+						candidate.m_block_mode = prev_candidate.m_block_mode;
+						candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;
+						candidate.m_reuse_delta_index = reuse_delta_index;
+
+						candidates.emplace_back(std::move(candidate));
+
+					} // reuse_delta_index
+				}
+
+				// ---- Solid candidate
+				if (global_cfg.m_use_solid_blocks)
+				{
+					candidate_encoding candidate;
+					candidate.m_coder.reserve(24);
+
+					// solid
+					candidate.m_encoding_type = encoding_type::cSolid;
+
+					float r = 0.0f, g = 0.0f, b = 0.0f;
+					const float LOG_BIAS = .125f;
+					bool solid_block = true;
+					for (uint32_t y = 0; y < BLOCK_H; y++)
+					{
+						for (uint32_t x = 0; x < BLOCK_W; x++)
+						{
+							if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||
+								(block_pixels[0][0][1] != block_pixels[y][x][1]) ||
+								(block_pixels[0][0][2] != block_pixels[y][x][2]))
+							{
+								solid_block = false;
+							}
+
+							r += log2f(block_pixels[y][x][0] + LOG_BIAS);
+							g += log2f(block_pixels[y][x][1] + LOG_BIAS);
+							b += log2f(block_pixels[y][x][2] + LOG_BIAS);
+						}
+					}
+
+					if (solid_block)
+					{
+						r = block_pixels[0][0][0];
+						g = block_pixels[0][0][1];
+						b = block_pixels[0][0][2];
+					}
+					else
+					{
+						r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
+						g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
+						b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
+
+						r = minimum<float>(r, basist::MAX_HALF_FLOAT);
+						g = minimum<float>(g, basist::MAX_HALF_FLOAT);
+						b = minimum<float>(b, basist::MAX_HALF_FLOAT);
+					}
+
+					basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);
+
+					candidate.m_solid_color[0] = rh;
+					candidate.m_solid_color[1] = gh;
+					candidate.m_solid_color[2] = bh;
+
+					candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);
+
+					candidate.m_coder.put_bits(rh, 15);
+					candidate.m_coder.put_bits(gh, 15);
+					candidate.m_coder.put_bits(bh, 15);
+
+					vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));
+
+					for (uint32_t y = 0; y < BLOCK_H; y++)
+						for (uint32_t x = 0; x < BLOCK_W; x++)
+							candidate.m_comp_pixels[y][x] = cp;
+
+					astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;
+
+					log_blk.clear();
+					log_blk.m_solid_color_flag_hdr = true;
+					log_blk.m_solid_color[0] = rh;
+					log_blk.m_solid_color[1] = gh;
+					log_blk.m_solid_color[2] = bh;
+					log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
+
+					candidate.m_decomp_log_blk = log_blk;
+
+					candidates.emplace_back(std::move(candidate));
+				}
+
+				if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))
+				{
+					static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };
+					static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };
+
+					static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };
+					static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };
+
+					static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };
+					static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };
+
+					uint32_t total_parts2 = 0, total_parts3 = 0;
+
+					assert(comp_level < 5);
+					if ((very_simple_block) && (comp_level <= 3))
+					{
+						// Block's std dev is so low that 2-3 subsets are unlikely to help much
+						total_parts2 = 0;
+						total_parts3 = 0;
+
+						debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);
+					}
+					else if (very_complex_block)
+					{
+						total_parts2 = s_parts2_very_complex[comp_level];
+						total_parts3 = s_parts3_very_complex[comp_level];
+
+						if (global_cfg.m_extra_patterns_flag)
+						{
+							total_parts2 += (comp_level == 4) ? 30 : 20;
+							total_parts3 += (comp_level == 4) ? 30 : 20;
+						}
+
+						debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);
+					}
+					else if (complex_block)
+					{
+						total_parts2 = s_parts2_complex[comp_level];
+						total_parts3 = s_parts3_complex[comp_level];
+
+						if (global_cfg.m_extra_patterns_flag)
+						{
+							total_parts2 += (comp_level == 4) ? 15 : 10;
+							total_parts3 += (comp_level == 4) ? 15 : 10;
+						}
+
+						debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);
+					}
+					else
+					{
+						// moderate complexity - use defaults
+						total_parts2 = s_parts2_normal[comp_level];
+						total_parts3 = s_parts3_normal[comp_level];
+
+						if (global_cfg.m_extra_patterns_flag)
+						{
+							total_parts2 += 5;
+							total_parts3 += 5;
+						}
+
+						debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);
+					}
+
+					if (!any_2subset_enabled)
+						total_parts2 = 0;
+
+					if (!any_3subset_enabled)
+						total_parts3 = 0;
+
+					int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];
+					bool has_estimated_parts2 = false;
+
+					if (total_parts2)
+					{
+						if (global_cfg.m_brute_force_partition_matching)
+						{
+							int candidate_pats2[NUM_UNIQUE_PARTITIONS2];
+							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)
+								candidate_pats2[i] = i;
+
+							if (any_2subset_enabled)
+							{
+								estimate_partitions_mode7_and_11(
+									2,
+									NUM_UNIQUE_PARTITIONS2, g_partitions2,
+									NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,
+									&half_pixels_as_floats[0][0],
+									coptions,
+									total_parts2, best_parts2_mode11, best_parts2_mode7);
+							}
+
+							has_estimated_parts2 = true;
+						}
+						else
+						{
+							if (comp_level >= 1)
+							{
+								const uint32_t MAX_CANDIDATES2 = 48;
+								int candidate_pats2[MAX_CANDIDATES2 * 2];
+
+								uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));
+								num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));
+
+								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);
+
+								if (has_estimated_parts2)
+								{
+									estimate_partitions_mode7_and_11(
+										2,
+										NUM_UNIQUE_PARTITIONS2, g_partitions2,
+										num_candidate_pats2, (uint32_t*)candidate_pats2,
+										&half_pixels_as_floats[0][0],
+										coptions,
+										total_parts2, best_parts2_mode11, best_parts2_mode7);
+								}
+							}
+							else
+							{
+								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);
+
+								if ((has_estimated_parts2) && (any_2subset_mode7_enabled))
+									memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));
+							}
+						}
+					}
+
+					int best_parts3[NUM_UNIQUE_PARTITIONS3];
+					bool has_estimated_parts3 = false;
+
+					if (total_parts3)
+					{
+#if 0
+						has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);
+#elif 1
+						if (global_cfg.m_brute_force_partition_matching)
+						{
+							int candidate_pats3[NUM_UNIQUE_PARTITIONS3];
+							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)
+								candidate_pats3[i] = i;
+
+							estimate_partitions_mode7(
+								3,
+								NUM_UNIQUE_PARTITIONS3, g_partitions3,
+								NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,
+								&half_pixels_as_floats[0][0],
+								coptions,
+								total_parts3, (uint32_t*)best_parts3);
+
+							has_estimated_parts3 = true;
+						}
+						else
+						{
+							const uint32_t MAX_CANDIDATES3 = 48;
+							int candidate_pats3[MAX_CANDIDATES3 * 2];
+
+							uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));
+							num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));
+
+							has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);
+
+							if (has_estimated_parts3)
+							{
+								estimate_partitions_mode7(
+									3,
+									NUM_UNIQUE_PARTITIONS3, g_partitions3,
+									num_candidate_pats3, (uint32_t*)candidate_pats3,
+									&half_pixels_as_floats[0][0],
+									coptions,
+									total_parts3, (uint32_t*)best_parts3);
+							}
+						}
+#endif
+					}
+
+					const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;
+
+					// ---- Encoded block candidate
+					for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)
+					{
+						const block_mode bm = (block_mode)block_mode_iter;
+
+						if (comp_level == 0)
+						{
+							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
+								continue;
+						}
+						else if (comp_level == 1)
+						{
+							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
+								continue;
+						}
+						else if (comp_level == 2)
+						{
+							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
+								continue;
+						}
+
+						if (global_cfg.m_block_stat_optimizations_flag)
+						{
+							if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))
+							{
+								if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
+								{
+									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)
+										continue;
+								}
+								else
+								{
+									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)
+										continue;
+								}
+							}
+
+							if (comp_level <= 3)
+							{
+								const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;
+								const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;
+
+								if (!g_block_mode_descs[block_mode_iter].m_dp)
+								{
+									// Minor gain (.5-1% less candidates)
+									if (very_detailed_block)
+									{
+										if (grid_x * grid_y <= 12)
+										{
+											debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);
+											continue;
+										}
+									}
+
+									// Major gains (10-25% less candidates)
+									if (very_blurry_block)
+									{
+										if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
+										{
+											debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);
+											continue;
+										}
+									}
+									if (super_blurry_block)
+									{
+										if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
+										{
+											debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);
+											continue;
+										}
+									}
+								}
+
+								if (grid_x != grid_y)
+								{
+									if (grid_x < grid_y)
+									{
+										if (!filter_horizontally)
+										{
+											debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);
+											continue;
+										}
+									}
+									else
+									{
+										if (filter_horizontally)
+										{
+											debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);
+											continue;
+										}
+									}
+								}
+							}
+
+							if (global_cfg.m_lambda == 0.0f)
+							{
+								// Rarely useful if lambda=0
+								if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
+									continue;
+							}
+						} // block_stat_optimizations_flag
+
+						if ((!use_single_subset_mode7) &&
+							(g_block_mode_descs[block_mode_iter].m_cem == 7) &&
+							(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))
+						{
+							debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);
+							continue;
+						}
+
+						for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)
+						{
+							if (global_cfg.m_lambda == 0.0f)
+							{
+								// No use trying anything else
+								if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)
+									continue;
+							}
+
+							if (global_cfg.m_disable_delta_endpoint_usage)
+							{
+								if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))
+									continue;
+							}
+
+							if (!global_cfg.m_favor_higher_compression)
+							{
+								if (comp_level == 0)
+								{
+									if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)
+										continue;
+								}
+
+								if (comp_level <= 1)
+								{
+									if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))
+										continue;
+								}
+							}
+
+							const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;
+
+							switch (em)
+							{
+							case endpoint_mode::cUseLeft:
+							case endpoint_mode::cUseUpper:
+							{
+								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
+								const uint32_t cem = local_md.m_cem;
+
+								if (local_md.m_num_partitions > 1)
+									break;
+
+								if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))
+									break;
+								else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))
+									break;
+
+								candidate_encoding candidate;
+								candidate.m_coder.reserve(24);
+								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+
+								int nx = bx, ny = by;
+								if (em == endpoint_mode::cUseLeft)
+									nx--;
+								else
+									ny--;
+
+								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
+								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
+									break;
+								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
+
+								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
+
+								if (neighbor_md.m_cem != cem)
+									break;
+
+								assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);
+
+								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
+								const bool dual_plane = local_md.m_dp;
+								const uint32_t num_grid_samples = grid_x * grid_y;
+								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
+
+								coded_log_blk.m_grid_width = (uint8_t)grid_x;
+								coded_log_blk.m_grid_height = (uint8_t)grid_y;
+								coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
+								coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
+								coded_log_blk.m_num_partitions = 1;
+								coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;
+								coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
+
+								// We're not explicitly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
+								coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;
+								memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);
+
+								uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
+
+								// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.
+								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
+									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
+									local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
+
+								// Now encode the block using the transcoded endpoints
+								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+								if (cem == 7)
+								{
+									status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
+										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
+								}
+								else
+								{
+									status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
+										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
+								}
+								if (!status)
+									break;
+
+								uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
+								if (dual_plane)
+								{
+									eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+									downsample_ise_weights_dual_plane(
+										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
+										BLOCK_W, BLOCK_H,
+										grid_x, grid_y,
+										trial_weights0, trial_weights1, coded_log_blk.m_weights);
+								}
+								else
+								{
+									eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
+
+									downsample_ise_weights(
+										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
+										BLOCK_W, BLOCK_H,
+										grid_x, grid_y,
+										trial_weights0, coded_log_blk.m_weights);
+								}
+
+								// Transcode these codable weights to ASTC weights.
+								uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
+
+								// Create the block the decoder would transcode into.
+								astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
+								decomp_blk.clear();
+
+								decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
+								decomp_blk.m_dual_plane = local_md.m_dp;
+								decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
+								decomp_blk.m_num_partitions = 1;
+								decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
+								decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
+
+								memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
+
+								copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
+
+								if (!validate_log_blk(decomp_blk))
+								{
+									fmt_error_printf("pack_astc_block() failed\n");
+									return false;
+								}
+
+								status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
+								if (!status)
+								{
+									fmt_error_printf("decode_astc_block() failed\n");
+									return false;
+								}
+
+								candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
+								code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);
+
+								candidate.m_encoding_type = encoding_type::cBlock;
+								candidate.m_endpoint_mode = em;
+								candidate.m_block_mode = bm;
+
+								candidates.emplace_back(std::move(candidate));
+
+								break;
+							}
+							case endpoint_mode::cUseLeftDelta:
+							case endpoint_mode::cUseUpperDelta:
+							{
+								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
+								const uint32_t cem = local_md.m_cem;
+
+								if (local_md.m_num_partitions > 1)
+									break;
+
+								if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))
+									break;
+								else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))
+									break;
+
+								candidate_encoding candidate;
+								candidate.m_coder.reserve(24);
+								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+
+								int nx = bx, ny = by;
+								if (em == endpoint_mode::cUseLeftDelta)
+									nx--;
+								else
+									ny--;
+
+								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
+								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
+									break;
+								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
+
+								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
+
+								if (neighbor_md.m_cem != cem)
+									break;
+
+								assert(neighbor_md.m_cem == local_md.m_cem);
+
+								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
+								const bool dual_plane = local_md.m_dp;
+								const uint32_t num_grid_samples = grid_x * grid_y;
+								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
+
+								// Dequantize neighbor's endpoints to ISE 20
+								uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];
+								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
+									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
+									astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);
+
+								// Requantize neighbor's endpoints to our local desired coding ISE range
+								uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];
+								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);
+
+								uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];
+								uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];
+
+								// Now try to encode the current block using the neighbor's endpoints submode.
+								double err = 0.0f;
+								uint32_t best_submode = 0;
+
+								if (cem == 7)
+								{
+									int maj_index, submode_index;
+									decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);
+
+									int first_submode = submode_index, last_submode = submode_index;
+
+									err = encode_astc_hdr_block_mode_7(
+										NUM_BLOCK_PIXELS,
+										(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
+										local_md.m_weight_ise_range,
+										best_submode,
+										BIG_FLOAT_VAL,
+										blk_endpoints, blk_weights0,
+										coptions,
+										local_md.m_endpoint_ise_range,
+										first_submode, last_submode,
+										&enc_block_stats);
+								}
+								else
+								{
+									int maj_index, submode_index;
+									decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);
+
+									int first_submode = -1, last_submode = -1;
+									if (maj_index == 3)
+									{
+										// direct
+									}
+									else
+									{
+										first_submode = submode_index;
+										last_submode = submode_index;
+									}
+
+									if (dual_plane)
+									{
+										err = encode_astc_hdr_block_mode_11_dual_plane(
+											NUM_BLOCK_PIXELS,
+											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
+											local_md.m_dp_channel,
+											local_md.m_weight_ise_range,
+											best_submode,
+											BIG_FLOAT_VAL,
+											blk_endpoints, blk_weights0, blk_weights1,
+											coptions,
+											false,
+											local_md.m_endpoint_ise_range,
+											false, //uber_mode_flag,
+											false,
+											first_submode, last_submode, true);
+									}
+									else
+									{
+										err = encode_astc_hdr_block_mode_11(
+											NUM_BLOCK_PIXELS,
+											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
+											local_md.m_weight_ise_range,
+											best_submode,
+											BIG_FLOAT_VAL,
+											blk_endpoints, blk_weights0,
+											coptions,
+											false,
+											local_md.m_endpoint_ise_range,
+											false, //uber_mode_flag,
+											false,
+											first_submode, last_submode, true,
+											mode11_opt_mode,
+											&enc_block_stats);
+									}
+								}
+
+								if (err == BIG_FLOAT_VAL)
+									break;
+
+								uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];
+
+								// TODO: For now, just try 5 bits for each endpoint. Can tune later.
+								// This isn't right, it's computing the deltas in ISE space.
+								//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
+								const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
+								const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
+
+								const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;
+
+								bool all_deltas_in_limits = true;
+								for (uint32_t i = 0; i < num_endpoint_vals; i++)
+								{
+									int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];
+
+									if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))
+										all_deltas_in_limits = false;
+
+									endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);
+								}
+
+								if (all_deltas_in_limits)
+								{
+									coded_log_blk.m_grid_width = (uint8_t)grid_x;
+									coded_log_blk.m_grid_height = (uint8_t)grid_y;
+									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
+									coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
+									coded_log_blk.m_num_partitions = 1;
+									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
+									coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
+									coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;
+
+									memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);
+
+									uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
+									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+
+									basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
+
+									if (dual_plane)
+									{
+										downsample_ise_weights_dual_plane(
+											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
+											BLOCK_W, BLOCK_H,
+											grid_x, grid_y,
+											blk_weights0, blk_weights1,
+											coded_log_blk.m_weights);
+									}
+									else
+									{
+										downsample_ise_weights(
+											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
+											BLOCK_W, BLOCK_H,
+											grid_x, grid_y,
+											blk_weights0, coded_log_blk.m_weights);
+									}
+
+									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
+
+									// Create the block the decoder would transcode into.
+
+									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
+									decomp_blk.clear();
+
+									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
+									decomp_blk.m_dual_plane = local_md.m_dp;
+									decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
+									decomp_blk.m_num_partitions = 1;
+									decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
+									decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
+
+									memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
+
+									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
+
+									if (!validate_log_blk(decomp_blk))
+									{
+										fmt_error_printf("pack_astc_block() failed\n");
+										return false;
+									}
+
+									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
+									if (!status)
+									{
+										fmt_error_printf("decode_astc_block() failed\n");
+										return false;
+									}
+
+									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
+									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);
+
+									candidate.m_encoding_type = encoding_type::cBlock;
+									candidate.m_endpoint_mode = em;
+									candidate.m_block_mode = bm;
+
+									candidates.emplace_back(std::move(candidate));
+								}
+
+								break;
+							}
+							case endpoint_mode::cRaw:
+							{
+								//if (candidates.size() == 339)
+								//	fmt_printf("!");
+
+								const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];
+								const uint32_t cem = mode_desc.m_cem;
+								//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);
+								const bool dual_plane = mode_desc.m_dp;
+
+								if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))
+									break;
+
+								if (mode_desc.m_num_partitions == 3)
+								{
+									assert(!dual_plane);
+
+									if (!has_estimated_parts3)
+										break;
+
+									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
+									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
+
+									trial_result res;
+
+									status = encode_block_3_subsets(
+										res,
+										cem,
+										mode_desc.m_grid_x, mode_desc.m_grid_y,
+										mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
+										&half_pixels[0][0], (vec4F*)block_pixels_q16,
+										coptions,
+										uber_mode_flag,
+										best_parts3, total_parts3, comp_level, mode11_opt_mode);
+
+									if (!status)
+										break;
+
+									assert(res.m_valid);
+
+									candidate_encoding candidate;
+									candidate.m_coder.reserve(24);
+									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+
+									coded_log_blk = res.m_log_blk;
+
+									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
+									decomp_blk = res.m_log_blk;
+
+									if (!validate_log_blk(decomp_blk))
+									{
+										fmt_error_printf("pack_astc_block() failed\n");
+										return false;
+									}
+
+									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
+									if (!status)
+									{
+										fmt_error_printf("decode_astc_block() failed\n");
+										return false;
+									}
+
+									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
+									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
+
+									candidate.m_encoding_type = encoding_type::cBlock;
+									candidate.m_endpoint_mode = em;
+									candidate.m_block_mode = bm;
+
+									candidates.emplace_back(std::move(candidate));
+								}
+								else if (mode_desc.m_num_partitions == 2)
+								{
+									assert(!dual_plane);
+
+									if (!has_estimated_parts2)
+										break;
+
+									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
+									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
+
+									for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)
+									{
+										trial_result results[2];
+
+										assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));
+
+										status = encode_block_2_subsets(
+											results,
+											mode_desc.m_grid_x, mode_desc.m_grid_y,
+											mode_desc.m_cem,
+											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
+											&half_pixels[0][0], (vec4F*)block_pixels_q16,
+											coptions,
+											uber_mode_flag,
+											(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],
+											comp_level,
+											mode11_opt_mode,
+											true);
+
+										if (!status)
+											continue;
+
+										for (uint32_t r_iter = 0; r_iter < 2; r_iter++)
+										{
+											const trial_result& res = results[r_iter];
+
+											if (!res.m_valid)
+												continue;
+
+											candidate_encoding candidate;
+											candidate.m_coder.reserve(24);
+											astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+
+											coded_log_blk = res.m_log_blk;
+
+											astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
+											decomp_blk = res.m_log_blk;
+
+											if (!validate_log_blk(decomp_blk))
+											{
+												fmt_error_printf("pack_astc_block() failed\n");
+												return false;
+											}
+
+											status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
+											if (!status)
+											{
+												fmt_error_printf("decode_astc_block() failed\n");
+												return false;
+											}
+
+											candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
+											code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
+
+											candidate.m_encoding_type = encoding_type::cBlock;
+											candidate.m_endpoint_mode = em;
+											candidate.m_block_mode = bm;
+
+											candidates.emplace_back(std::move(candidate));
+
+										} // r_iter
+									}
+								}
+								else
+								{
+									// 1 subset
+									uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];
+									uint32_t best_submode = 0;
+
+									candidate_encoding candidate;
+									candidate.m_coder.reserve(24);
+									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
+
+									const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;
+									const uint32_t num_grid_samples = grid_x * grid_y;
+
+									const half_vec3* pBlock_pixels_half = &half_pixels[0][0];
+									const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];
+
+									const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);
+
+									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
+
+									coded_log_blk.m_grid_width = (uint8_t)grid_x;
+									coded_log_blk.m_grid_height = (uint8_t)grid_y;
+									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
+									coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
+									coded_log_blk.m_num_partitions = 1;
+									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
+									coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;
+									coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;
+
+									if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
+									{
+										double e = encode_astc_hdr_block_downsampled_mode_11(
+											BLOCK_W, BLOCK_H, grid_x, grid_y,
+											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
+											NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
+											BIG_FLOAT_VAL,
+											FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,
+											coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,
+											coptions,
+											&enc_block_stats);
+
+										if (e == BIG_FLOAT_VAL)
+											break;
+									}
+									else
+									{
+										if (cem == 7)
+										{
+											assert(!dual_plane);
+
+											double e = encode_astc_hdr_block_mode_7(
+												NUM_BLOCK_PIXELS,
+												(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
+												mode_desc.m_weight_ise_range,
+												best_submode,
+												BIG_FLOAT_VAL,
+												coded_log_blk.m_endpoints,
+												blk_weights0,
+												coptions,
+												mode_desc.m_endpoint_ise_range,
+												0, MAX_MODE7_SUBMODE_INDEX,
+												&enc_block_stats);
+											BASISU_NOTE_UNUSED(e);
+										}
+										else
+										{
+											double e;
+
+											if (dual_plane)
+											{
+												e = encode_astc_hdr_block_mode_11_dual_plane(
+													NUM_BLOCK_PIXELS,
+													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
+													mode_desc.m_dp_channel,
+													mode_desc.m_weight_ise_range,
+													best_submode,
+													BIG_FLOAT_VAL,
+													coded_log_blk.m_endpoints,
+													blk_weights0, blk_weights1,
+													coptions,
+													false,
+													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);
+											}
+											else
+											{
+												e = encode_astc_hdr_block_mode_11(
+													NUM_BLOCK_PIXELS,
+													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
+													mode_desc.m_weight_ise_range,
+													best_submode,
+													BIG_FLOAT_VAL,
+													coded_log_blk.m_endpoints,
+													blk_weights0,
+													coptions,
+													false,
+													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
+													mode11_opt_mode,
+													&enc_block_stats);
+											}
+
+											if (e == BIG_FLOAT_VAL)
+												break;
+										}
+
+										if (dual_plane)
+										{
+											downsample_ise_weights_dual_plane(
+												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
+												BLOCK_W, BLOCK_H,
+												grid_x, grid_y,
+												blk_weights0, blk_weights1,
+												coded_log_blk.m_weights);
+										}
+										else
+										{
+											downsample_ise_weights(
+												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
+												BLOCK_W, BLOCK_H,
+												grid_x, grid_y,
+												blk_weights0, coded_log_blk.m_weights);
+
+											if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
+											{
+												bool refine_status = refine_endpoints(cem,
+													mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,
+													6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,
+													coded_log_blk.m_weights, mode_desc.m_weight_ise_range,
+													BLOCK_W * BLOCK_H,
+													(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,
+													nullptr,
+													coptions, mode11_opt_mode);
+												BASISU_NOTE_UNUSED(refine_status);
+											}
+										}
+									}
+
+									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);
+
+									// Create the block the decoder would transcode into.
+									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
+									decomp_blk.clear();
+
+									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
+									decomp_blk.m_dual_plane = mode_desc.m_dp;
+									decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
+									decomp_blk.m_num_partitions = 1;
+									decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;
+									decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;
+
+									basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
+
+									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
+
+									if (!validate_log_blk(decomp_blk))
+									{
+										fmt_error_printf("pack_astc_block() failed\n");
+										return false;
+									}
+
+									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
+									if (!status)
+									{
+										fmt_error_printf("decode_astc_block() failed\n");
+										return false;
+									}
+
+									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
+									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
+
+									candidate.m_encoding_type = encoding_type::cBlock;
+									candidate.m_endpoint_mode = em;
+									candidate.m_block_mode = bm;
+
+									candidates.emplace_back(std::move(candidate));
+								}
+
+								break;
+							}
+							default:
+								assert(0);
+								fmt_debug_printf("Invalid endpoint mode\n");
+								return false;
+
+							} // switch (em)
+
+						} // endpoint_mode_iter
+
+					} // block_mode_iter
+
+				} // is_solid_block
+
+				//------------------------------------------------
+
+				debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);
+				atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());
+
+				for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
+				{
+					auto& candidate = candidates[candidate_iter];
+
+					for (uint32_t y = 0; y < BLOCK_H; y++)
+						for (uint32_t x = 0; x < BLOCK_W; x++)
+							linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);
+				}
+
+				// Find best overall candidate
+				double best_t = BIG_FLOAT_VAL;
+				int best_candidate_index = -1;
+
+				float best_d_ssim = BIG_FLOAT_VAL;
+
+				if (global_cfg.m_lambda == 0.0f)
+				{
+					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
+					{
+						const auto& candidate = candidates[candidate_iter];
+
+						float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
+
+						if (candidate_d_ssim < best_d_ssim)
+							best_d_ssim = candidate_d_ssim;
+
+						candidate_d_ssim *= SSIM_WEIGHT;
+
+						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
+
+						candidate_mse += candidate_d_ssim;
+
+						float total_deblock_penalty = 0.0f;
+						if (global_cfg.m_deblocking_flag)
+						{
+							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
+						}
+						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
+
+						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
+						{
+							// Bias the encoder away from 2 level blocks on complex blocks
+							// TODO: Perhaps only do this on large or non-interpolated grids
+							if (complex_block)
+							{
+								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
+								{
+									candidate_mse *= TWO_LEVEL_PENALTY;
+								}
+							}
+
+							// Bias the encoder away from smaller weight grids if the block is very complex
+							// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.
+							if (complex_block)
+							{
+								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
+								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
+								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
+							}
+						}
+
+						float candidate_t = candidate_mse;
+
+						if (candidate_t < best_t)
+						{
+							best_t = candidate_t;
+							best_candidate_index = candidate_iter;
+						}
+
+					} // candidate_iter
+
+					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
+					{
+						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
+						continue;
+					}
+
+					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
+
+					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
+						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
+						(block_avg_y >= 1.5f))
+					{
+						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
+						continue;
+					}
+				}
+				else
+				{
+					assert(enc_state.smooth_block_mse_scales.get_width() > 0);
+
+					// Compute block's perceptual weighting
+					float perceptual_scale = 0.0f;
+					for (uint32_t y = 0; y < BLOCK_H; y++)
+						for (uint32_t x = 0; x < BLOCK_W; x++)
+							perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));
+
+					// Very roughly normalize the computed distortion vs. bits.
+					perceptual_scale *= 10.0f;
+
+					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
+					{
+						auto& candidate = candidates[candidate_iter];
+
+						float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
+
+						if (d_ssim < best_d_ssim)
+							best_d_ssim = (float)d_ssim;
+
+						d_ssim *= SSIM_WEIGHT;
+
+						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
+
+						candidate_mse += d_ssim;
+
+						float total_deblock_penalty = 0.0f;
+						if (global_cfg.m_deblocking_flag)
+						{
+							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
+						}
+						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
+
+						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
+						{
+							// Bias the encoder away from 2 level blocks on complex blocks
+							if (complex_block)
+							{
+								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
+								{
+									candidate_mse *= TWO_LEVEL_PENALTY;
+								}
+							}
+
+							// Bias the encoder away from smaller weight grids if the block is very complex
+							if (complex_block)
+							{
+								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
+								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
+								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
+									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
+							}
+						}
+
+						float mode_penalty = 1.0f;
+						if (candidate.m_encoding_type == encoding_type::cSolid)
+							mode_penalty *= SOLID_PENALTY;
+						else if (candidate.m_encoding_type == encoding_type::cReuse)
+							mode_penalty *= REUSE_PENALTY;
+						else if (candidate.m_encoding_type == encoding_type::cRun)
+							mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);
+
+						float candidate_bits = (float)candidate.m_coder.get_total_bits();
+						float candidate_d = candidate_mse * mode_penalty;
+
+						const float D_POWER = 2.0f;
+						float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);
+
+						candidate.m_t = candidate_t;
+						candidate.m_d = candidate_d;
+						candidate.m_bits = candidate_bits;
+
+						if (candidate_t < best_t)
+						{
+							best_t = candidate_t;
+							best_candidate_index = candidate_iter;
+						}
+
+					} // candidate_iter
+
+					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
+					{
+						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
+						continue;
+					}
+
+					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
+
+					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
+						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
+						(block_avg_y >= 1.5f))
+					{
+						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
+						continue;
+					}
+
+					if (global_cfg.m_rdo_candidate_diversity_boost)
+					{
+						// candidate diversity boosting - consider candidates along/near the Pareto front
+						const candidate_encoding& comp_candidate = candidates[best_candidate_index];
+
+						float best_d = BIG_FLOAT_VAL;
+
+						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
+						{
+							const auto& candidate = candidates[candidate_iter];
+
+							if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)
+							{
+								if (candidate.m_d < best_d)
+								{
+									best_d = candidate.m_d;
+									best_candidate_index = candidate_iter;
+								}
+							}
+						}
+					}
+
+					// Candidate JND optimization - if there's a cheaper-to-code candidate that is nearly equivalent visually to the best candidate chosen so far, choose that one instead
+					if (global_cfg.m_jnd_optimization)
+					{
+						const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];
+
+						float new_best_candidate_bits = BIG_FLOAT_VAL;
+						int new_best_candidate_index = -1;
+
+						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
+						{
+							if ((int)candidate_iter == best_candidate_index)
+								continue;
+
+							const auto& candidate = candidates[candidate_iter];
+
+							if (candidate.m_bits >= cur_comp_candidate.m_bits)
+								continue;
+
+							float max_delta_itp = 0.0f;
+							for (uint32_t y = 0; y < BLOCK_H; y++)
+							{
+								for (uint32_t x = 0; x < BLOCK_W; x++)
+								{
+									float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);
+									max_delta_itp = maximum(max_delta_itp, delta_itp);
+
+									if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
+										goto skip;
+								}
+							}
+
+						skip:
+							if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
+								continue;
+
+							if (candidate.m_bits < new_best_candidate_bits)
+							{
+								new_best_candidate_bits = candidate.m_bits;
+								new_best_candidate_index = candidate_iter;
+							}
+						}
+
+						if (new_best_candidate_index != -1)
+						{
+							best_candidate_index = new_best_candidate_index;
+							debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);
+						}
+					}
+
+				} // if (lambda == 0.0f)
+
+				if (global_cfg.m_debug_images)
+				{
+					std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);
+					debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));
+				}
+
+				if (best_candidate_index < 0)
+				{
+					assert(best_candidate_index >= 0);
+					fmt_error_printf("No candidates!\n");
+					return false;
+				}
+
+				const auto& best_candidate = candidates[best_candidate_index];
+
+				assert(best_candidate.m_encoding_type != encoding_type::cInvalid);
+
+				if (best_candidate.m_encoding_type == encoding_type::cRun)
+				{
+					if (!prev_run_len)
+					{
+						if (prev_encoding.get_total_bits())
+						{
+#if SYNC_MARKERS
+							strip_coded_bits.put_bits(0xDEAD, 16);
+#endif
+
+							strip_coded_bits.append(prev_encoding);
+						}
+
+						assert(best_candidate.m_coder.get_total_bits());
+
+						prev_encoding = best_candidate.m_coder;
+
+						prev_run_len = 1;
+					}
+					else
+					{
+						prev_run_len++;
+
+						const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
+						assert(prev_run_bits);
+						BASISU_NOTE_UNUSED(prev_run_bits);
+
+						const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();
+						BASISU_NOTE_UNUSED(num_dummy_bits);
+
+						// Rewrite the previous encoding to extend the run length.
+						prev_encoding.restart();
+						prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);
+						prev_encoding.put_vlc(prev_run_len - 1, 5);
+
+						assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);
+					}
+				}
+				else
+				{
+					if (prev_encoding.get_total_bits())
+					{
+#if SYNC_MARKERS
+						strip_coded_bits.put_bits(0xDEAD, 16);
+#endif
+
+						strip_coded_bits.append(prev_encoding);
+					}
+
+					prev_encoding = best_candidate.m_coder;
+					prev_run_len = 0;
+				}
+
+				memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);
+
+				prev_candidate_encoding = best_candidate;
+
+				if (best_candidate.m_encoding_type != encoding_type::cRun)
+					prev_non_run_candidate_encoding = best_candidate;
+
+				{
+					std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);
+
+					debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;
+
+					if (best_candidate.m_encoding_type == encoding_type::cBlock)
+					{
+						debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;
+					}
+
+					if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))
+					{
+						const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;
+						assert(bm_index < (uint32_t)block_mode::cBMTotalModes);
+
+						debug_state.m_block_mode_hist[bm_index]++;
+						debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();
+
+						for (uint32_t i = 0; i < 3; i++)
+						{
+							debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);
+							debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);
+						}
+					}
+
+					if (best_candidate.m_encoding_type == encoding_type::cReuse)
+					{
+						debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);
+
+						if (best_candidate.m_coded_log_blk.m_dual_plane)
+							debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);
+					}
+				}
+
+				enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;
+
+				// Update decoded image
+				vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];
+				for (uint32_t y = 0; y < BLOCK_H; y++)
+					for (uint32_t x = 0; x < BLOCK_W; x++)
+						decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];
+
+				enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
+
+				status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);
+				if (!status)
+				{
+					fmt_error_printf("Failed packing block\n");
+					return false;
+				}
+
+				const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);
+				if ((r & 2047) == 2047)
+				{
+					if (global_cfg.m_status_output)
+					{
+						basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);
+					}
+				}
+
+				if ((global_cfg.m_debug_images) &&
+					((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))
+				{
+					std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);
+
+					if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)
+					{
+						const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
+						assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));
+
+						const partition_pattern_vec& pat = g_partitions2[part2_unique_index];
+
+						for (uint32_t y = 0; y < 6; y++)
+						{
+							for (uint32_t x = 0; x < 6; x++)
+							{
+								const uint32_t p = pat[x + y * 6];
+								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));
+							} // x
+						} // y 
+					}
+					else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)
+					{
+						//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));
+
+						const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
+						assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));
+
+						const partition_pattern_vec& pat = g_partitions3[part3_unique_index];
+
+						for (uint32_t y = 0; y < 6; y++)
+						{
+							for (uint32_t x = 0; x < 6; x++)
+							{
+								const uint32_t p = pat[x + y * 6];
+								color_rgba c(0, 0, 150, 255);
+								if (p == 1)
+									c.set(100, 0, 150, 255);
+								else if (p == 2)
+									c.set(0, 100, 150, 255);
+								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);
+							} // x
+						} // y 
+					}
+					else if (best_candidate.m_decomp_log_blk.m_dual_plane)
+					{
+						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));
+					}
+					else
+					{
+						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));
+					}
+
+					color_rgba c;
+					c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);
+					debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);
+
+					c.set(0, 0, 0, 255);
+					if (complex_block)
+						c[0] = 255;
+
+					if (very_complex_block)
+						c[1] = 255;
+
+					if (outer_pass == 2)
+						c[2] = 255;
+					else if (outer_pass == 1)
+						c[2] = 128;
+
+					debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);
+
+					c.set(0, 255, 0, 255);
+					if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)
+						c.set(255, 0, 0, 255);
+					debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);
+
+					switch (best_candidate.m_encoding_type)
+					{
+					case encoding_type::cRun:
+						c.set(0, 0, 0, 255);
+						break;
+					case encoding_type::cSolid:
+						c.set(128, 128, 128, 255); // dark grey
+						break;
+					case encoding_type::cReuse:
+						c.set(255, 255, 0, 255); // yellow
+						break;
+					case encoding_type::cBlock:
+					{
+						switch (best_candidate.m_endpoint_mode)
+						{
+						case endpoint_mode::cRaw:
+							c.set(255, 0, 0, 255); // red
+							break;
+						case endpoint_mode::cUseLeft:
+							c.set(0, 0, 255, 255); // blue
+							break;
+						case endpoint_mode::cUseUpper:
+							c.set(0, 0, 192, 255); // darker blue
+							break;
+						case endpoint_mode::cUseLeftDelta:
+							c.set(0, 255, 0, 255); // green
+							break;
+						case endpoint_mode::cUseUpperDelta:
+							c.set(0, 192, 0, 255); // darker green
+							break;
+						default:
+							break;
+						}
+
+						break;
+					}
+					default:
+						break;
+					}
+
+					if (filtered_x_err < filtered_y_err)
+						c[3] = 0;
+					else
+						c[3] = 255;
+
+					debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);
+				}
+
+				break;
+
+			} // outer_pass
+
+		} // bx
+
+	} // by
+
+	if (prev_encoding.get_total_bits())
+	{
+#if SYNC_MARKERS
+		strip_coded_bits.put_bits(0xDEAD, 16);
+#endif
+
+		strip_coded_bits.append(prev_encoding);
+	}
+
+	return true;
+}
+
+bool g_initialized = false; // Set to true by global_init() after its init routines have run; compress_photo() returns false while this is still false.
+
+// One-time setup for this encoder: runs the PQ table, 2- and 3-subset 6x6
+// partition, and contribution list init routines, then sets g_initialized.
+// Calls made after the flag is set do nothing.
+// NOTE(review): g_initialized is a plain bool and no locking is visible here;
+// presumably this is called once before any worker threads start - confirm.
+void global_init()
+{
+	if (!g_initialized)
+	{
+		// Only consumed by the disabled trace line at the bottom.
+		interval_timer init_timer;
+		init_timer.start();
+
+		// The init calls are kept in their original order.
+		init_pq_tables();
+
+		init_partitions2_6x6();
+		init_partitions3_6x6();
+
+		init_contrib_lists();
+
+		g_initialized = true;
+
+		//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", init_timer.get_elapsed_secs());
+	}
+}
+
+bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,
+	basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)
+{
+	assert(g_initialized);
+	if (!g_initialized)
+		return false;
+	
+	assert(pJob_pool);
+
+	if (orig_global_cfg.m_debug_output)
+	{
+		fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");
+		fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());
+		fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());
+		orig_global_cfg.print();
+	}
+
+	if (!orig_src_img.get_width() || !orig_src_img.get_height())
+	{
+		assert(false);
+		fmt_error_printf("compress_photo: Invalid source image\n");
+		return false;
+	}
+
+	astc_hdr_6x6_global_config global_cfg(orig_global_cfg);
+
+	uastc_hdr_6x6_encode_state enc_state;
+	enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;
+	enc_state.src_img = orig_src_img;
+
+	//src_img.crop(256, 256);
+
+	const uint32_t width = enc_state.src_img.get_width();
+	const uint32_t height = enc_state.src_img.get_height();
+	const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);
+	const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);
+	const uint32_t total_blocks = num_blocks_x * num_blocks_y;
+
+	for (uint32_t y = 0; y < height; y++)
+	{
+		for (uint32_t x = 0; x < width; x++)
+		{
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				float f = enc_state.src_img(x, y)[c];
+
+				if (std::isinf(f) || std::isnan(f) || (f < 0.0f))
+					f = 0;
+				else if (f > basist::ASTC_HDR_MAX_VAL)
+					f = basist::ASTC_HDR_MAX_VAL;
+
+				enc_state.src_img(x, y)[c] = f;
+								
+			} // c
+						
+		} // x
+	} // y
+	
+	if (global_cfg.m_debug_images)
+	{
+		write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);
+	}
+			
+	image src_img_compressed;
+	tonemap_image_compressive2(src_img_compressed, enc_state.src_img);
+
+	if (global_cfg.m_debug_images)
+	{
+		save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);
+	}
+
+	smooth_map_params rp;
+	rp.m_debug_images = global_cfg.m_debug_images;
+
+	if (global_cfg.m_lambda != 0.0f)
+	{
+		if (global_cfg.m_status_output)
+			fmt_printf("Creating RDO perceptual weighting maps\n");
+
+		create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);
+	}
+
+	if (global_cfg.m_status_output)
+		fmt_printf("Blurring image\n");
+
+	enc_state.src_img_filtered1.resize(width, height);
+	image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);
+	
+	enc_state.src_img_filtered2.resize(width, height);
+	image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);
+		
+	if (global_cfg.m_debug_images)
+	{
+		write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);
+		write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);
+	}
+
+	if (global_cfg.m_status_output)
+		fmt_printf("Transforming to ITP\n");
+
+	enc_state.src_img_itp.resize(width, height);
+	convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);
+	
+	enc_state.src_img_filtered1_itp.resize(width, height);
+	convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);
+	
+	enc_state.src_img_filtered2_itp.resize(width, height);
+	convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);
+
+	if (global_cfg.m_lambda == 0.0f)
+		global_cfg.m_favor_higher_compression = false;
+
+	uint32_t total_strips = 0, rows_per_strip = 0;
+	if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))
+	{
+		fmt_error_printf("compress_photo: Failed computing strip sizes\n");
+		return false;
+	}
+		
+	if (global_cfg.m_debug_output)
+		fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);
+					
+	enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);
+						
+	bitwise_coder coded_bits;
+
+	coded_bits.put_bits(0xABCD, 16);
+	coded_bits.put_bits(width, 16);
+	coded_bits.put_bits(height, 16);
+					
+	enc_state.packed_img.resize(width, height);
+		
+	enc_state.strip_bits.resize(total_strips);
+
+	enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);
+
+	uastc_hdr_6x6_debug_state debug_state;
+
+	if (global_cfg.m_debug_images)
+		debug_state.init(width, height);
+	else
+		debug_state.init(0, 0);
+		
+	interval_timer tm;
+	tm.start();
+
+	std::atomic_bool any_failed_flag;
+	any_failed_flag.store(false);
+
+	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
+	{
+		const uint32_t strip_first_by = strip_index * rows_per_strip;
+		
+		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
+		if (strip_index == (total_strips - 1))
+			strip_last_by = num_blocks_y - 1;
+
+		pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,
+			strip_index, total_strips, strip_first_by, strip_last_by,
+			num_blocks_x, num_blocks_y, total_blocks, width, height]
+		{
+			if (!any_failed_flag)
+			{
+				bool status = compress_strip_task(
+					strip_index, total_strips, strip_first_by, strip_last_by,
+					num_blocks_x, num_blocks_y, total_blocks, width, height,
+					global_cfg, debug_state, enc_state);
+
+				if (!status)
+				{
+					fmt_error_printf("compress_photo: compress_strip_task() failed\n");
+					any_failed_flag.store(true, std::memory_order_relaxed);
+				}
+			}
+		} );
+
+		if (any_failed_flag)
+			break;
+	
+	} // strip_index
+
+	pJob_pool->wait_for_all();
+
+	if (any_failed_flag)
+	{
+		fmt_error_printf("One or more strips failed during compression\n");
+		return false;
+	}
+				
+	if (global_cfg.m_debug_output)
+		fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());
+
+	if (global_cfg.m_debug_output)
+		debug_state.print(total_blocks);
+
+	if (global_cfg.m_debug_images)
+	{
+		save_png(global_cfg.m_debug_image_prefix +  "part_vis.png", debug_state.m_part_vis);
+		save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);
+		save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);
+		save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);
+		save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);
+		write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);
+	}
+
+	for (uint32_t i = 0; i < total_strips; i++)
+		coded_bits.append(enc_state.strip_bits[i]);
+		
+	coded_bits.put_bits(0xA742, 16);
+
+	coded_bits.flush();
+
+	if (global_cfg.m_output_images)
+	{
+		write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);
+	}
+	
+	if (global_cfg.m_debug_output)
+		fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));
+
+	vector2D<astc_helpers::astc_block> decoded_blocks1;
+	vector2D<astc_helpers::astc_block> decoded_blocks2;
+	
+	if (global_cfg.m_debug_output)
+		fmt_printf("decode_file\n");
+
+	uint32_t unpacked_width = 0, unpacked_height = 0;
+	bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);
+	if (!status)
+	{
+		fmt_error_printf("decode_file() failed\n");
+		return false;
+	}
+
+	if (global_cfg.m_debug_output)
+		fmt_printf("decode_6x6_hdr\n");
+
+	status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);
+	if (!status)
+	{
+		fmt_error_printf("decode_6x6_hdr_file() failed\n");
+		return false;
+	}
+
+	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||
+		(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))
+	{
+		fmt_error_printf("Decode size mismatch with decode_file\n");
+		return false;
+	}
+
+	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||
+		(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))
+	{
+		fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");
+		return false;
+	}
+
+	if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)
+	{
+		fmt_error_printf("Decoded ASTC blocks verification failed\n");
+		return false;
+	}
+
+	if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)
+	{
+		fmt_error_printf("Decoded ASTC blocks verification failed\n");
+		return false;
+	}
+
+	if (global_cfg.m_debug_output)
+		basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");
+
+	if (global_cfg.m_output_images)
+	{
+		if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))
+		{
+			basisu::platform_sleep(20);
+
+			uint8_vec astc_file_data;
+			if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))
+			{
+				if (astc_file_data.size() > 16)
+				{
+					astc_file_data.erase(0, 16);
+
+					size_t comp_size = 0;
+					void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);
+					mz_free(pComp_data);
+
+					if (global_cfg.m_debug_output)
+					{
+						fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",
+							(uint64_t)astc_file_data.size(),
+							(float)astc_file_data.size() * 8.0f / (float)(width * height),
+							(float)comp_size * 8.0f / (float)(width * height));
+					}
+				}
+			}
+		}
+	}
+
+	// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.
+	imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);
+	imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);
+
+	for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)
+	{
+		for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)
+		{
+			const auto& phys_blk = decoded_blocks1(x, y);
+
+			vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];
+			status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);
+			if (!status)
+			{
+				fmt_error_printf("unpack_physical_astc_block() failed\n");
+				return false;
+			}
+			
+			unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
+
+			vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];
+			status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);
+			if (!status)
+			{
+				fmt_error_printf("unpack_physical_astc_block_google() failed\n");
+				return false;
+			}
+
+			unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
+
+			for (uint32_t i = 0; i < 36; i++)
+			{
+				if (pixels[i] != pixels_google[i])
+				{
+					fmt_error_printf("pixel unpack mismatch\n");
+					return false;
+				}
+			}
+		}
+	}
+		
+	if (global_cfg.m_debug_output)
+		fmt_printf("\nUnpack succeeded\n");
+
+	imagef unpacked_bc6h_img;
+
+	{
+		vector2D<basist::bc6h_block> bc6h_blocks;
+		
+		fast_bc6h_params enc_params;
+						
+		bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);
+		if (!pack_status)
+		{
+			fmt_error_printf("pack_bc6h_image() failed!");
+			return false;
+		}
+
+		unpacked_bc6h_img.crop(width, height);
+		
+		if (global_cfg.m_output_images)
+		{
+			write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);
+		}
+	}
+
+	unpacked_astc_img.crop(width, height);
+	unpacked_astc_google_img.crop(width, height);
+	
+	if (global_cfg.m_output_images)
+	{
+		write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);
+		write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);
+	}
+
+	// ASTC metrics
+	if (global_cfg.m_image_stats)
+	{
+		image_metrics im;
+
+		if (global_cfg.m_debug_output)
+			printf("\nASTC log2 float error metrics:\n");
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);
+
+			if (global_cfg.m_debug_output)
+			{
+				printf("%c:   ", "RGBA"[i]);
+				im.print_hp();
+			}
+		}
+		
+		metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);
+
+		if (global_cfg.m_debug_output)
+		{
+			printf("RGB: ");
+			metrics.m_im_astc_log2.print_hp();
+
+			printf("\n");
+		}
+	}
+
+	if (global_cfg.m_image_stats)
+	{
+		image_metrics im;
+
+		if (global_cfg.m_debug_output)
+			printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);
+
+			if (global_cfg.m_debug_output)
+			{
+				printf("%c:   ", "RGBA"[i]);
+				im.print_hp();
+			}
+		}
+
+		metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);
+
+		if (global_cfg.m_debug_output)
+		{
+			printf("RGB: ");
+			metrics.m_im_astc_half.print_hp();
+		}
+	}
+
+	// BC6H metrics
+	if (global_cfg.m_image_stats)
+	{
+		image_metrics im;
+
+		if (global_cfg.m_debug_output)
+			printf("\nBC6H log2 float error metrics:\n");
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);
+			
+			if (global_cfg.m_debug_output)
+			{
+				printf("%c:   ", "RGBA"[i]);
+				im.print_hp();
+			}
+		}
+
+		metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);
+
+		if (global_cfg.m_debug_output)
+		{
+			printf("RGB: ");
+			metrics.m_im_bc6h_log2.print_hp();
+
+			printf("\n");
+		}
+	}
+
+	if (global_cfg.m_image_stats)
+	{
+		image_metrics im;
+		
+		if (global_cfg.m_debug_output)
+			printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);
+			
+			if (global_cfg.m_debug_output)
+			{
+				printf("%c:   ", "RGBA"[i]);
+				im.print_hp();
+			}
+		}
+
+		metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);
+		
+		if (global_cfg.m_debug_output)
+		{
+			printf("RGB: ");
+			metrics.m_im_bc6h_half.print_hp();
+
+			printf("\n");
+		}
+	}
+
+	intermediate_tex_data.swap(coded_bits.get_bytes());
+
+	astc_tex_data.resize(decoded_blocks1.size_in_bytes());
+	memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());
+
+	return true;
+}
+
+} // namespace astc_6x6_hdr

+ 129 - 0
thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h

@@ -0,0 +1,129 @@
+// File: basisu_astc_hdr_6x6_enc.h
+#pragma once
+#include "basisu_enc.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
+
+namespace astc_6x6_hdr
+{
+	// Highest value accepted by astc_hdr_6x6_global_config::set_user_level().
+	const uint32_t ASTC_HDR_6X6_MAX_USER_COMP_LEVEL = 12;
+
+	// Highest internal compression level (see m_master_comp_level/m_highest_comp_level below).
+	const uint32_t ASTC_HDR_6X6_MAX_COMP_LEVEL = 4;
+
+	const float LDR_BLACK_BIAS = 0.0f;// .49f;
+
+	// Encoder-wide settings for the ASTC HDR 6x6 encoder (passed by const reference to compress_photo()).
+	// Note: This struct is copied several times, so do not place any heavyweight objects in here.
+	struct astc_hdr_6x6_global_config
+	{
+		// Important: The Delta ITP colorspace error metric we use internally makes several assumptions about the nature of the HDR RGB inputs supplied to the encoder.
+		// This encoder computes colorspace error in the ICtCp (or more accurately the delta ITP, where CT is scaled by .5 vs. ICtCp to become T) colorspace, so getting this correct is important.
+		// By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/m^2), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light).
+		// If the m_rec2020_bt2100_color_gamut flag is true, the input colorspace is treated as REC 2020/BT.2100 (which is wider than 709).
+		// For SDR/LDR->HDR upconversion, the REC 709 sRGB input should be converted to linear light (sRGB->linear) and the resulting normalized linear RGB values scaled by either 80 or 100 nits (the luminance of a typical SDR monitor).
+		// SDR upconversion to normalized [0,1] (i.e. non-absolute) luminances may work but is not supported because ITP errors will not be predicted correctly.
+		bool m_rec2020_bt2100_color_gamut = false;
+
+		// levels 0-3 normal levels, 4=exhaustive
+		uint32_t m_master_comp_level = 0;
+		uint32_t m_highest_comp_level = 1;
+
+		float m_lambda = 0.0f;
+
+		bool m_extra_patterns_flag = false; // def to false, works in comp levels [1,4]
+		bool m_brute_force_partition_matching = false; // def to false
+
+		bool m_jnd_optimization = false; // defaults to false for HDR inputs, on SDR upconverted images this can default to enabled
+		float m_jnd_delta_itp_thresh = .75f;
+
+		bool m_force_one_strip = false;
+
+		bool m_gaussian1_fallback = true; // def to true, if this is disabled m_gaussian2_fallback should be disabled too
+		float m_gaussian1_strength = 1.45f;
+
+		bool m_gaussian2_fallback = true; // def to true, hopefully rarely kicks in
+		float m_gaussian2_strength = 1.83f;
+
+		// m_disable_delta_endpoint_usage may give a slight increase in RDO ASTC encoding efficiency. It's also faster.
+		bool m_disable_delta_endpoint_usage = false;
+
+		// Scale up Delta ITP errors for very dark pixels, assuming they will be brightly exposed > 1.0x.
+		// We don't know if the output will be exposed, or not. If heavily exposed, our JND calculations will not be conservative enough.
+		bool m_delta_itp_dark_adjustment = true;
+
+		// When set, compress_photo() saves its debug visualization images using m_debug_image_prefix.
+		bool m_debug_images = false;
+		std::string m_debug_image_prefix = "dbg_astc_hdr_6x6_devel_";
+
+		// When set, compress_photo() writes the compressed/unpacked result images using m_output_image_prefix.
+		bool m_output_images = false;
+		std::string m_output_image_prefix = "dbg_astc_hdr_6x6_output_";
+
+		bool m_debug_output = false; // enables the informational printing done by compress_photo()
+		bool m_image_stats = false; // enables computation of the result_metrics error metrics
+		bool m_status_output = false;
+
+		//-------------------------------------------------------------------------------------
+		// Very low level/devel parameters - intended for development. Best not to change them.
+		//-------------------------------------------------------------------------------------
+		bool m_deblocking_flag = true;
+		float m_deblock_penalty_weight = .03f;
+		bool m_disable_twothree_subsets = false; // def to false
+		bool m_use_solid_blocks = true; // def to true
+		bool m_use_runs = true; // def to true
+		bool m_block_stat_optimizations_flag = true; // def to true
+
+		bool m_rdo_candidate_diversity_boost = true; // def to true
+		float m_rdo_candidate_diversity_boost_bit_window_weight = 1.2f;
+
+		bool m_favor_higher_compression = true; // utilize all modes
+		uint32_t m_num_reuse_xy_deltas = basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS;
+
+		// Dumps every configuration field through basisu::fmt_debug_printf().
+		void print() const
+		{
+			basisu::fmt_debug_printf("m_master_comp_level: {}, m_highest_comp_level: {}\n", m_master_comp_level, m_highest_comp_level);
+			basisu::fmt_debug_printf("m_lambda: {}\n", m_lambda);
+			basisu::fmt_debug_printf("m_rec2020_bt2100_color_gamut: {}\n", m_rec2020_bt2100_color_gamut);
+			basisu::fmt_debug_printf("m_extra_patterns_flag: {}, m_brute_force_partition_matching: {}\n", m_extra_patterns_flag, m_brute_force_partition_matching);
+			basisu::fmt_debug_printf("m_jnd_optimization: {}, m_jnd_delta_itp_thresh: {}\n", m_jnd_optimization, m_jnd_delta_itp_thresh);
+			basisu::fmt_debug_printf("m_force_one_strip: {}\n", m_force_one_strip);
+			basisu::fmt_debug_printf("m_gaussian1_fallback: {}, m_gaussian1_strength: {}\n", m_gaussian1_fallback, m_gaussian1_strength);
+			basisu::fmt_debug_printf("m_gaussian2_fallback: {}, m_gaussian2_strength: {}\n", m_gaussian2_fallback, m_gaussian2_strength);
+			basisu::fmt_debug_printf("m_disable_delta_endpoint_usage: {}\n", m_disable_delta_endpoint_usage);
+			basisu::fmt_debug_printf("m_delta_itp_dark_adjustment: {}\n", m_delta_itp_dark_adjustment);
+			basisu::fmt_debug_printf("m_debug_images: {}, m_debug_image_prefix: {}\n", m_debug_images, m_debug_image_prefix);
+			basisu::fmt_debug_printf("m_output_images: {}, m_output_image_prefix: {}\n", m_output_images, m_output_image_prefix);
+			// m_debug_output was the only field missing from this dump.
+			basisu::fmt_debug_printf("m_debug_output: {}\n", m_debug_output);
+			basisu::fmt_debug_printf("m_image_stats: {}, m_status_output: {}\n", m_image_stats, m_status_output);
+			basisu::fmt_debug_printf("m_deblocking_flag: {}, m_deblock_penalty_weight: {}\n", m_deblocking_flag, m_deblock_penalty_weight);
+			basisu::fmt_debug_printf("m_disable_twothree_subsets: {}, m_use_solid_blocks: {}\n", m_disable_twothree_subsets, m_use_solid_blocks);
+			basisu::fmt_debug_printf("m_use_runs: {}, m_block_stat_optimizations_flag: {}\n", m_use_runs, m_block_stat_optimizations_flag);
+			basisu::fmt_debug_printf("m_rdo_candidate_diversity_boost: {}, m_rdo_candidate_diversity_boost_bit_window_weight: {}\n", m_rdo_candidate_diversity_boost, m_rdo_candidate_diversity_boost_bit_window_weight);
+			basisu::fmt_debug_printf("m_favor_higher_compression: {}, m_num_reuse_xy_deltas: {}\n", m_favor_higher_compression, m_num_reuse_xy_deltas);
+		}
+
+		astc_hdr_6x6_global_config()
+		{
+		}
+
+		// Restores every field to its default by swapping with a default-constructed instance.
+		void clear()
+		{
+			astc_hdr_6x6_global_config def;
+			std::swap(*this, def);
+		}
+
+		// Max level is ASTC_HDR_6X6_MAX_USER_COMP_LEVEL
+		void set_user_level(int level);
+	};
+
+	void global_init();
+
+	// Error metrics filled in by compress_photo() when astc_hdr_6x6_global_config::m_image_stats is set.
+	struct result_metrics
+	{
+		// Source vs. unpacked ASTC, computed with image_metrics::calc() and calc_half() respectively.
+		basisu::image_metrics m_im_astc_log2;
+		basisu::image_metrics m_im_astc_half;
+
+		// Source vs. the BC6H re-encoding of the unpacked ASTC image, same two metric flavors.
+		basisu::image_metrics m_im_bc6h_log2;
+		basisu::image_metrics m_im_bc6h_half;
+	};
+
+	// The input image should be unpadded to 6x6 boundaries, i.e. the original unexpanded image.
+	// On success returns true, with intermediate_tex_data receiving the coded intermediate bitstream and
+	// astc_tex_data receiving the raw ASTC blocks. Returns false on any failure.
+	bool compress_photo(const basisu::imagef& orig_src_img, const astc_hdr_6x6_global_config& global_cfg, basisu::job_pool* pJob_pool,
+		basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics);
+
+} // namespace astc_6x6_hdr

+ 5357 - 0
thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp

@@ -0,0 +1,5357 @@
+// File: basisu_astc_hdr_common.cpp
+#include "basisu_enc.h"
+#include "basisu_gpu_texture.h"
+#include "../transcoder/basisu_astc_helpers.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
+#include "basisu_astc_hdr_common.h"
+
+using namespace basist;
+
+#ifndef __EMSCRIPTEN__
+	#define BASISU_MULTITHREADED_INIT (0)
+#endif
+
+namespace basisu
+{
+
+const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33] = // one row per ISE weight range: [0] = level count N, [1..N] = the N weight values (each in [0,64]); unused tail entries are zero
+{
+	{ 2, 0, 64 }, // range 0 (2 levels); note ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block)
+	{ 3, 0, 32, 64 }, // range 1 (3 levels)
+	{ 4, 0, 21, 43, 64 }, // range 2 (4 levels)
+	{ 5, 0, 16, 32, 48, 64 }, // range 3 (5 levels)
+	{ 6, 0, 64, 12, 52, 25, 39 }, // range 4 (6 levels; values are not stored in ascending order)
+	{ 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // range 5 (8 levels)
+	{ 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // range 6 (10 levels; not ascending)
+	{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // range 7 (12 levels; not ascending)
+	{ 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // range 8 (16 levels)
+	{ 20, 0,64,16,48,3,61,19,45,6,58,23,41,9,55,26,38,13,51,29,35}, // range 9 (20 levels; not ascending)
+	{ 24, 0,64,8,56,16,48,24,40,2,62,11,53,19,45,27,37,5,59,13,51,22,42,30,34}, // range 10 (24 levels; not ascending)
+	{ 32, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64}, // range 11 (32 levels)
+};
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+const float DEF_R_ERROR_SCALE = 2.0f;
+const float DEF_G_ERROR_SCALE = 3.0f;
+
+void astc_hdr_codec_base_options::init()
+{
+	m_r_err_scale = DEF_R_ERROR_SCALE;
+	m_g_err_scale = DEF_G_ERROR_SCALE;
+	m_q_log_bias = Q_LOG_BIAS_4x4;
+
+	m_ultra_quant = false;
+
+	// Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output.
+	m_allow_uber_mode = false;
+
+	m_mode7_full_s_optimization = true;
+
+	m_take_first_non_clamping_mode11_submode = false;
+	m_take_first_non_clamping_mode7_submode = false;
+
+	m_disable_weight_plane_optimization = true;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+// max usable qlog8 value is 247, 248=inf, >=249 is nan
+// max usable qlog7 value is 123, 124=inf, >=125 is nan
+
+//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0
+
+// Nearest-qlog lookup tables, indexed by the bit pattern of a positive half float (0..32767); filled in by compute_half_to_qlog_table(), which leaves inf/NaN entries untouched.
+static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768];
+
+const uint32_t HALF_TO_QLOG_TABS_MIN_BITS = 7; // bit depth of g_pHalf_to_qlog_tabs[0]
+const uint32_t HALF_TO_QLOG_TABS_MAX_BITS = 8; // bit depth of g_pHalf_to_qlog_tabs[1]
+static uint16_t* g_pHalf_to_qlog_tabs[2] = // indexed by (bits - HALF_TO_QLOG_TABS_MIN_BITS)
+{
+	g_half_to_qlog7,
+	g_half_to_qlog8,
+};
+
+#if 0
+static inline uint32_t half_to_qlog7_8(half_float h, uint32_t bits)
+{
+	assert((bits >= HALF_TO_QLOG_TABS_MIN_BITS) && (bits <= HALF_TO_QLOG_TABS_MAX_BITS));
+	assert(h < 32768);
+
+	return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS][h];
+}
+#endif
+
+// Reduces a 16-bit qlog value to desired_bits (7..12) bits: adds a rounding bias of half a step minus one,
+// drops the low bits, and clamps the result to the largest desired_bits-bit code.
+// TODO: Tune this
+static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits)
+{
+	assert((desired_bits >= 7) && (desired_bits <= 12));
+	assert(q16 <= 65535);
+
+	const uint32_t num_dropped_bits = 16 - desired_bits;
+	const uint32_t rounding_bias = (1U << (num_dropped_bits - 1U)) - 1U;
+	const uint32_t highest_code = (1U << desired_bits) - 1U;
+
+	return minimum<uint32_t>((q16 + rounding_bias) >> num_dropped_bits, highest_code);
+}
+
+// Fills pTable[h], for every positive half-float bit pattern h in [0, 32767] that is not inf/NaN, with the
+// bits-wide qlog code whose value (looked up in qlog16_to_float after left-aligning the code to 16 bits)
+// is closest to the half float's value. Entries for inf/NaN patterns are not written.
+static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector<float>& qlog16_to_float)
+{
+	assert(bits >= 5 && bits <= 12);
+
+	const uint32_t FIRST_INVALID_QLOG16_INDEX = 63488; // first inf, rest are inf/nan's
+	assert(std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX]));
+	assert(std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX + 1]));
+	assert(!std::isnan(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX - 1]));
+	assert(!std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX - 1]));
+
+	const uint32_t total_codes = 1U << bits;
+	const uint32_t align_shift = 16 - bits;
+
+	for (uint32_t half_bits = 0; half_bits < 32768; half_bits++)
+	{
+		// Nothing sensible to map inf/NaN inputs to
+		if (is_half_inf_or_nan((half_float)half_bits))
+			continue;
+
+		const float target = half_to_float((half_float)half_bits);
+
+		uint32_t nearest_code = 0;
+		float nearest_dist = BIG_FLOAT_VAL;
+		double last_dist = BIG_FLOAT_VAL;
+
+		for (uint32_t code = 0; code < total_codes; code++)
+		{
+			// Codes at/after the first infinity are unusable
+			const uint32_t qlog16_index = code << align_shift;
+			if (qlog16_index >= FIRST_INVALID_QLOG16_INDEX)
+				break;
+
+			const float dist = fabsf(qlog16_to_float[qlog16_index] - target);
+
+			// Once the distance starts growing, no later code can do better
+			if (dist > last_dist)
+				break;
+
+			last_dist = dist;
+
+			if (dist < nearest_dist)
+			{
+				nearest_dist = dist;
+				nearest_code = code;
+
+				// Exact hit
+				if (nearest_dist == 0.0f)
+					break;
+			}
+		}
+
+		pTable[half_bits] = (uint16_t)nearest_code;
+	}
+}
+
+// Builds the half float -> qlog lookup tables referenced by g_pHalf_to_qlog_tabs, one table per bit depth in
+// [HALF_TO_QLOG_TABS_MIN_BITS, HALF_TO_QLOG_TABS_MAX_BITS].
+static void init_qlog_tables()
+{
+	// Float value of every possible 16-bit qlog code
+	basisu::vector<float> qlog16_to_float(65536);
+
+	for (uint32_t q = 0; q < 65536; q++)
+	{
+		const half_float as_half = astc_helpers::qlog16_to_half(q);
+		qlog16_to_float[q] = half_to_float(as_half);
+	}
+
+#if BASISU_MULTITHREADED_INIT
+	job_pool jp(3);
+	
+	for (uint32_t bits = HALF_TO_QLOG_TABS_MIN_BITS; bits <= HALF_TO_QLOG_TABS_MAX_BITS; bits++)
+	{
+		jp.add_job( [bits, &qlog16_to_float]() { compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS], qlog16_to_float); });
+	}
+
+	jp.wait_for_all();
+#else
+	// Single threaded: build each bit depth's table in turn
+	for (uint32_t bits = HALF_TO_QLOG_TABS_MIN_BITS; bits <= HALF_TO_QLOG_TABS_MAX_BITS; bits++)
+	{
+		uint16_t* const pDst_table = g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS];
+		compute_half_to_qlog_table(bits, pDst_table, qlog16_to_float);
+
+#if 0
+		std::vector<uint16_t> check_tab(32768);
+		compute_half_to_qlog_table_orig(bits, check_tab.data(), qlog16_to_float);
+		for (uint32_t i = 0; i < (1 << bits); i++)
+		{
+			assert(check_tab[i] == g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS][i]);
+		}
+#endif
+	}
+#endif // BASISU_MULTITHREADED_INIT
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Returns the per-channel average of the first three components of num_pixels pixels.
+static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels)
+{
+	vec3F sum(0.0f);
+
+	const vec4F* const pEnd = pPixels + num_pixels;
+	for (const vec4F* pCur = pPixels; pCur != pEnd; ++pCur)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+			sum[c] += (*pCur)[c];
+	}
+
+	return sum / static_cast<float>(num_pixels);
+}
+
+// Estimates the dominant RGB axis of the pixels around mean_color: accumulates the (unnormalized) covariance
+// terms, runs three power iterations from a fixed starting vector, and returns the normalized result. When
+// the result is too short to normalize, the fixed (0.577..., 0.577..., 0.577...) axis is returned instead.
+static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color)
+{
+	// Upper triangle of the symmetric 3x3 covariance matrix
+	float cov_rr = 0.0f, cov_rg = 0.0f, cov_rb = 0.0f;
+	float cov_gg = 0.0f, cov_gb = 0.0f, cov_bb = 0.0f;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		const vec4F& px = pPixels[i];
+
+		const float dr = px[0] - mean_color[0];
+		const float dg = px[1] - mean_color[1];
+		const float db = px[2] - mean_color[2];
+
+		cov_rr += dr * dr;
+		cov_rg += dr * dg;
+		cov_rb += dr * db;
+		cov_gg += dg * dg;
+		cov_gb += dg * db;
+		cov_bb += db * db;
+	}
+
+	// Power iteration, rescaling by the largest magnitude component after each step
+	float axis_r = .9f, axis_g = 1.0f, axis_b = .7f;
+	for (uint32_t iter = 0; iter < 3; iter++)
+	{
+		float next_r = axis_r * cov_rr + axis_g * cov_rg + axis_b * cov_rb;
+		float next_g = axis_r * cov_rg + axis_g * cov_gg + axis_b * cov_gb;
+		float next_b = axis_r * cov_rb + axis_g * cov_gb + axis_b * cov_bb;
+
+		const float largest = maximumf(maximumf(fabsf(next_r), fabsf(next_g)), fabsf(next_b));
+
+		if (largest > 1e-10f)
+		{
+			const float rescale = 1.0f / largest;
+
+			next_r *= rescale;
+			next_g *= rescale;
+			next_b *= rescale;
+		}
+
+		axis_r = next_r;
+		axis_g = next_g;
+		axis_b = next_b;
+	}
+
+	const float len_sq = axis_r * axis_r + axis_g * axis_g + axis_b * axis_b;
+
+	// Fallback axis, used unless the iterated vector is long enough to normalize
+	vec3F result(0.5773502691f);
+
+	if (len_sq >= 1e-10f)
+	{
+		const float inv_len = 1.0f / sqrtf(len_sq);
+
+		result.set(axis_r * inv_len, axis_g * inv_len, axis_b * inv_len);
+	}
+
+	return result;
+}
+
+// Records the pixel count, then caches the block's mean color and its principal axis
+// (the axis is computed around the mean stored just before it).
+void encode_astc_block_stats::init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[])
+{
+	this->m_num_pixels = num_pixels;
+
+	this->m_mean_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+
+	// Must follow the mean computation above
+	this->m_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, this->m_mean_q16);
+}
+
+// Evaluates mean + dir * df. If that point lies inside colorspace_box it is returned as-is and *pInside (when
+// supplied) is set to true. Otherwise *pInside is set to false, a ray is cast from the point back towards
+// mean, and the ray's intersection with input_box is returned; if ray_aabb() does not report cSuccess the
+// unmodified point is returned.
+static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr)
+{
+#if 0
+	assert(mean[0] >= input_box[0][0]);
+	assert(mean[1] >= input_box[0][1]);
+	assert(mean[2] >= input_box[0][2]);
+	assert(mean[0] <= input_box[1][0]);
+	assert(mean[1] <= input_box[1][1]);
+	assert(mean[2] <= input_box[1][2]);
+#endif
+
+	const vec3F target(mean + dir * df);
+
+	const bool target_is_inside = colorspace_box.contains(target);
+	if (pInside)
+		*pInside = target_is_inside;
+
+	if (target_is_inside)
+		return target;
+
+	// Cast from the out-of-box target back towards the mean
+	ray3F back_ray(target, (mean - target).normalize_in_place());
+
+	vec3F hit;
+	float hit_t = 0.0f;
+
+	if (intersection::ray_aabb(hit, hit_t, back_ray, input_box) != intersection::cSuccess)
+		hit = target;
+
+	return hit;
+}
+
+// all in Q16 space, 0-65535
+// Least squares fit of two RGB endpoints (*pXl, *pXh) to N colors given each color's selector.
+// pSelector_weights[sel] supplies the three normal-equation terms in components 0..2 and the interpolation
+// weight in component 3. Returns false (outputs untouched) when the 2x2 system is singular.
+static bool compute_least_squares_endpoints_rgb(
+	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+	vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box)
+{
+	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	float m00 = 0.0f, m10 = 0.0f, m11 = 0.0f;
+	float weighted_sum[3] = { 0.0f, 0.0f, 0.0f };
+	float plain_sum[3] = { 0.0f, 0.0f, 0.0f };
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const vec4F& terms = pSelector_weights[pSelectors[i]];
+
+		m00 += terms[0];
+		m10 += terms[1];
+		m11 += terms[2];
+
+		const float w = terms[3];
+
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			weighted_sum[c] += w * pColors[i][c];
+			plain_sum[c] += pColors[i][c];
+		}
+	}
+
+	// The matrix is symmetric, so its off-diagonal entries are both m10
+	const float determinant = m00 * m11 - m10 * m10;
+	if (determinant == 0.0f)
+		return false;
+
+	const float inv_det = 1.0f / determinant;
+
+	const float inv00 = m11 * inv_det;
+	const float inv_off = -m10 * inv_det;
+	const float inv11 = m00 * inv_det;
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		const float q00 = weighted_sum[c];
+		const float q10 = plain_sum[c] - weighted_sum[c];
+
+		(*pXl)[c] = (float)(inv00 * q00 + inv_off * q10);
+		(*pXh)[c] = (float)(inv_off * q00 + inv11 * q10);
+
+		// A (nearly) constant input channel just takes the input box's bounds
+		if (input_box.get_dim(c) < .0000125f)
+		{
+			(*pXl)[c] = input_box[0][c];
+			(*pXh)[c] = input_box[1][c];
+		}
+	}
+
+	const vec3F center((*pXl + *pXh) * .5f);
+	vec3F axis(*pXh - *pXl);
+
+	const float axis_len = axis.length();
+	if (axis_len != 0.0f)
+	{
+		axis /= axis_len;
+
+		const float lo_dist = (*pXl - center).dot(axis);
+		const float hi_dist = (*pXh - center).dot(axis);
+
+		const aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL));
+
+		// Only endpoints that fall outside the colorspace box get replaced
+		bool lo_inside = false;
+		const vec3F lo_fixed = interp_color(center, axis, lo_dist, colorspace_box, input_box, &lo_inside);
+		if (!lo_inside)
+			*pXl = lo_fixed;
+
+		bool hi_inside = false;
+		const vec3F hi_fixed = interp_color(center, axis, hi_dist, colorspace_box, input_box, &hi_inside);
+		if (!hi_inside)
+			*pXh = hi_fixed;
+	}
+
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+// Same fit as compute_least_squares_endpoints_rgb(), but driven directly by per-pixel raw weights
+// (pRaw_weights[i] / 64, asserted to be <= 1) instead of selectors plus a precomputed weight table.
+// Returns false (outputs untouched) when the 2x2 system is singular.
+static bool compute_least_squares_endpoints_rgb_raw_weights(
+	uint32_t N, const uint8_t* pRaw_weights, 
+	vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box)
+{
+	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	float m00 = 0.0f, m10 = 0.0f, m11 = 0.0f;
+	float weighted_sum[3] = { 0.0f, 0.0f, 0.0f };
+	float plain_sum[3] = { 0.0f, 0.0f, 0.0f };
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const float wt = (float)pRaw_weights[i] * (1.0f / 64.0f);
+		assert(wt <= 1.0f);
+
+		const float inv_wt = 1.0f - wt;
+
+		m00 += wt * wt;
+		m10 += inv_wt * wt;
+		m11 += inv_wt * inv_wt;
+
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			weighted_sum[c] += wt * pColors[i][c];
+			plain_sum[c] += pColors[i][c];
+		}
+	}
+
+	// The matrix is symmetric, so its off-diagonal entries are both m10
+	const float determinant = m00 * m11 - m10 * m10;
+	if (determinant == 0.0f)
+		return false;
+
+	const float inv_det = 1.0f / determinant;
+
+	const float inv00 = m11 * inv_det;
+	const float inv_off = -m10 * inv_det;
+	const float inv11 = m00 * inv_det;
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		const float q00 = weighted_sum[c];
+		const float q10 = plain_sum[c] - weighted_sum[c];
+
+		(*pXl)[c] = (float)(inv00 * q00 + inv_off * q10);
+		(*pXh)[c] = (float)(inv_off * q00 + inv11 * q10);
+
+		// A (nearly) constant input channel just takes the input box's bounds
+		if (input_box.get_dim(c) < .0000125f)
+		{
+			(*pXl)[c] = input_box[0][c];
+			(*pXh)[c] = input_box[1][c];
+		}
+	}
+
+	const vec3F center((*pXl + *pXh) * .5f);
+	vec3F axis(*pXh - *pXl);
+
+	const float axis_len = axis.length();
+	if (axis_len != 0.0f)
+	{
+		axis /= axis_len;
+
+		const float lo_dist = (*pXl - center).dot(axis);
+		const float hi_dist = (*pXh - center).dot(axis);
+
+		const aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL));
+
+		// Only endpoints that fall outside the colorspace box get replaced
+		bool lo_inside = false;
+		const vec3F lo_fixed = interp_color(center, axis, lo_dist, colorspace_box, input_box, &lo_inside);
+		if (!lo_inside)
+			*pXl = lo_fixed;
+
+		bool hi_inside = false;
+		const vec3F hi_fixed = interp_color(center, axis, hi_dist, colorspace_box, input_box, &hi_inside);
+		if (!hi_inside)
+			*pXh = hi_fixed;
+	}
+
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+// Two-channel least squares endpoint fit: solves the 2x2 normal equations built from the selector weight
+// table for each of the two channels, substitutes the input box bounds on (nearly) constant channels, and
+// clamps both endpoints to [0, MAX_QLOG16_VAL]. Returns false (outputs untouched) when the system is singular.
+static bool compute_least_squares_endpoints_2D(
+	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+	vec2F* pXl, vec2F* pXh, const vec2F* pColors, const aabb2F& input_box)
+{
+	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	float m00 = 0.0f, m10 = 0.0f, m11 = 0.0f;
+	float weighted_sum[2] = { 0.0f, 0.0f };
+	float plain_sum[2] = { 0.0f, 0.0f };
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const vec4F& terms = pSelector_weights[pSelectors[i]];
+
+		m00 += terms[0];
+		m10 += terms[1];
+		m11 += terms[2];
+
+		const float w = terms[3];
+
+		for (uint32_t c = 0; c < 2; c++)
+		{
+			weighted_sum[c] += w * pColors[i][c];
+			plain_sum[c] += pColors[i][c];
+		}
+	}
+
+	// The matrix is symmetric, so its off-diagonal entries are both m10
+	const float determinant = m00 * m11 - m10 * m10;
+	if (determinant == 0.0f)
+		return false;
+
+	const float inv_det = 1.0f / determinant;
+
+	const float inv00 = m11 * inv_det;
+	const float inv_off = -m10 * inv_det;
+	const float inv11 = m00 * inv_det;
+
+	for (uint32_t c = 0; c < 2; c++)
+	{
+		const float q00 = weighted_sum[c];
+		const float q10 = plain_sum[c] - weighted_sum[c];
+
+		(*pXl)[c] = (float)(inv00 * q00 + inv_off * q10);
+		(*pXh)[c] = (float)(inv_off * q00 + inv11 * q10);
+
+		// A (nearly) constant input channel just takes the input box's bounds
+		if (input_box.get_dim(c) < .0000125f)
+		{
+			(*pXl)[c] = input_box[0][c];
+			(*pXh)[c] = input_box[1][c];
+		}
+	}
+
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+// Single-channel least squares endpoint fit: solves the 2x2 normal equations built from the selector weight
+// table, substitutes the input box bounds when the input range is (nearly) zero, and clamps both endpoints
+// to [0, MAX_QLOG16_VAL]. Returns false (outputs untouched) when the system is singular.
+static bool compute_least_squares_endpoints_1D(
+	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+	vec1F* pXl, vec1F* pXh, const vec1F* pColors, const aabb1F& input_box)
+{
+	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	float m00 = 0.0f, m10 = 0.0f, m11 = 0.0f;
+	float weighted_sum = 0.0f;
+	float plain_sum = 0.0f;
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const vec4F& terms = pSelector_weights[pSelectors[i]];
+
+		m00 += terms[0];
+		m10 += terms[1];
+		m11 += terms[2];
+
+		const float w = terms[3];
+
+		weighted_sum += w * pColors[i][0];
+		plain_sum += pColors[i][0];
+	}
+
+	const float q00 = weighted_sum;
+	const float q10 = plain_sum - weighted_sum;
+
+	// The matrix is symmetric, so its off-diagonal entries are both m10
+	const float determinant = m00 * m11 - m10 * m10;
+	if (determinant == 0.0f)
+		return false;
+
+	const float inv_det = 1.0f / determinant;
+
+	const float inv00 = m11 * inv_det;
+	const float inv_off = -m10 * inv_det;
+	const float inv11 = m00 * inv_det;
+
+	(*pXl)[0] = (float)(inv00 * q00 + inv_off * q10);
+	(*pXh)[0] = (float)(inv_off * q00 + inv11 * q10);
+
+	// A (nearly) constant input just takes the input box's bounds
+	if (input_box.get_dim(0) < .0000125f)
+	{
+		(*pXl)[0] = input_box[0][0];
+		(*pXh)[0] = input_box[1][0];
+	}
+
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+// Weighted least-squares fit of RGB low/high endpoints.
+//   pColors           - pixel colors (pi); only components 0..2 are read
+//   pSelectors /
+//   pSelector_weights - when pSelectors is non-null, the projection weight (ti) of pixel i is
+//                       pSelector_weights[pSelectors[i]][3]; otherwise pRaw_weights[i] is used
+//   pEmphasis_weights - per-pixel emphasis weights (wi)
+//   input_box         - unused
+// Returns false (outputs untouched) when the emphasis weights sum to zero or the weighted variance of ti is zero.
+// On success *pXl / *pXh are written and clamped to [0, MAX_QLOG16_VAL].
+static bool compute_weighted_least_squares_endpoints_rgb(
+	uint32_t N, 
+	const uint8_t* pSelectors, const vec4F* pSelector_weights, const float* pRaw_weights, /* ti */
+	const float* pEmphasis_weights /* wi */,
+	vec3F* pXl, vec3F* pXh, 
+	const vec4F* pColors, /* pi */
+	const aabb3F& input_box)
+{
+	(void)input_box;
+
+	assert(N);
+	assert((pSelectors && pSelector_weights) || pRaw_weights);
+	assert(pEmphasis_weights);
+
+	// Projection weight (ti) of a pixel, from whichever source the caller supplied.
+	const auto projection_weight = [&](uint32_t idx) -> float
+	{
+		return pSelectors ? pSelector_weights[pSelectors[idx]][3] : pRaw_weights[idx];
+	};
+
+	float emphasis_sum = 0.0f;
+	for (uint32_t idx = 0; idx < N; idx++)
+		emphasis_sum += pEmphasis_weights[idx];
+
+	if (emphasis_sum == 0.0f)
+		return false;
+
+	// Emphasis-weighted means of the projection weights and of each color channel.
+	float mean_t = 0.0f;
+	float mean_color[3] = { 0.0f, 0.0f, 0.0f };
+
+	for (uint32_t idx = 0; idx < N; idx++)
+	{
+		const float wi = pEmphasis_weights[idx];
+		const float ti = projection_weight(idx);
+
+		mean_t += wi * ti;
+
+		for (uint32_t ch = 0; ch < 3; ch++)
+			mean_color[ch] += wi * pColors[idx][ch];
+	}
+
+	mean_t /= emphasis_sum;
+
+	for (uint32_t ch = 0; ch < 3; ch++)
+		mean_color[ch] /= emphasis_sum;
+
+	// Weighted covariance of each channel with t, and weighted variance of t.
+	float covariance[3] = { 0.0f, 0.0f, 0.0f };
+	float variance_t = 0.0f;
+
+	for (uint32_t idx = 0; idx < N; idx++)
+	{
+		const float wi = pEmphasis_weights[idx];
+		const float t_delta = projection_weight(idx) - mean_t;
+
+		for (uint32_t ch = 0; ch < 3; ch++)
+			covariance[ch] += wi * (pColors[idx][ch] - mean_color[ch]) * t_delta;
+
+		variance_t += wi * square(t_delta);
+	}
+
+	if (variance_t == 0.0f)
+		return false;
+
+	// Evaluate the fitted line at t = 1 (high endpoint) and t = 0 (low endpoint).
+	for (uint32_t ch = 0; ch < 3; ch++)
+	{
+		const float slope = covariance[ch] / variance_t;
+
+		(*pXh)[ch] = mean_color[ch] + slope * (1.0f - mean_t);
+		(*pXl)[ch] = mean_color[ch] - slope * mean_t;
+	}
+
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][astc_index] -> (w * w, (1 - w) * w, (1 - w) * (1 - w), w) with w = weight lerp / 64; filled in by encode_astc_hdr_init()
+
+static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][astc_index] -> linear index (rank of the level when sorted by weight lerp value); filled in by encode_astc_hdr_init()
+static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][linear_index] -> astc_index (inverse of the table above); filled in by encode_astc_hdr_init()
+
+// Fills the per-ISE-range encoder tables:
+//  - g_astc_ls_weights_ise: least squares constants per weight level: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
+//  - g_map_linear_to_astc_order / g_map_astc_to_linear_order: permutations between ASTC weight index order and
+//    the order obtained by sorting the levels by their lerp value.
+static void encode_astc_hdr_init()
+{
+	for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++)
+	{
+		// Entry 0 of each lerp row holds the level count; the lerp values follow it.
+		const uint32_t level_count = g_ise_weight_lerps[ise_range][0];
+		assert(level_count == astc_helpers::get_ise_levels(ise_range));
+		assert((level_count >= MIN_SUPPORTED_WEIGHT_LEVELS) && (level_count <= MAX_SUPPORTED_WEIGHT_LEVELS));
+
+		// Sort keys: lerp value in the upper bits, ASTC index in the low byte.
+		uint32_t sort_keys[MAX_SUPPORTED_WEIGHT_LEVELS];
+
+		for (uint32_t level = 0; level < level_count; level++)
+		{
+			const float w = g_ise_weight_lerps[ise_range][1 + level] * (1.0f / 64.0f);
+			const float one_minus_w = 1.0f - w;
+
+			g_astc_ls_weights_ise[ise_range][level].set(w * w, one_minus_w * w, one_minus_w * one_minus_w, w);
+
+			sort_keys[level] = (g_ise_weight_lerps[ise_range][1 + level] << 8) + level;
+		}
+
+		std::sort(sort_keys, sort_keys + level_count);
+
+		for (uint32_t linear = 0; linear < level_count; linear++)
+		{
+			const uint8_t astc_index = (uint8_t)(sort_keys[linear] & 0xFF);
+
+			g_map_linear_to_astc_order[ise_range][linear] = astc_index;
+			g_map_astc_to_linear_order[ise_range][astc_index] = (uint8_t)linear;
+		}
+	}
+
+	//init_quantize_tables();
+}
+
+// Set once astc_hdr_enc_init() has completed.
+bool g_astc_hdr_enc_initialized;
+
+// One-time setup of the ASTC HDR encoder tables. Calls after the first successful one return immediately.
+void astc_hdr_enc_init()
+{
+	if (!g_astc_hdr_enc_initialized)
+	{
+		astc_hdr_core_init();
+		astc_helpers::init_tables(true);
+		init_qlog_tables();
+		encode_astc_hdr_init();
+
+		g_astc_hdr_enc_initialized = true;
+	}
+}
+
+// Interpolates between two qlog12 RGB endpoints for the first n weight levels of ise_weight_range.
+//   e              - e[0] = low endpoint, e[1] = high endpoint, each component in [0, 0xFFF]
+//   pDecoded_half  - optional output, n * 3 half floats (RGB interleaved)
+//   pDecoded_float - optional output, n RGB vectors holding the same colors converted with half_to_float()
+// The endpoints are shifted left by 4 bits before blending, and each blended value goes through
+// astc_helpers::qlog16_to_half(). Level i uses the weight g_ise_weight_lerps[ise_weight_range][1 + i].
+void interpolate_qlog12_colors(
+	const int e[2][3],
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	// Shift both endpoints up by 4 bits once, up front.
+	uint32_t low_q16[3], high_q16[3];
+	for (uint32_t ch = 0; ch < 3; ch++)
+	{
+		assert(in_range(e[0][ch], 0, 0xFFF));
+		assert(in_range(e[1][ch], 0, 0xFFF));
+
+		low_q16[ch] = e[0][ch] << 4;
+		high_q16[ch] = e[1][ch] << 4;
+	}
+
+	for (uint32_t level = 0; level < n; level++)
+	{
+		const int weight = g_ise_weight_lerps[ise_weight_range][1 + level];
+		assert(weight == (int)astc_helpers::dequant_bise_weight(level, ise_weight_range));
+
+		half_float decoded[3];
+		for (uint32_t ch = 0; ch < 3; ch++)
+		{
+			// 6-bit blend with rounding.
+			const int blended = (low_q16[ch] * (64 - weight) + high_q16[ch] * weight + 32) / 64;
+			decoded[ch] = astc_helpers::qlog16_to_half(blended);
+		}
+
+		if (pDecoded_half)
+		{
+			for (uint32_t ch = 0; ch < 3; ch++)
+				pDecoded_half[level * 3 + ch] = decoded[ch];
+		}
+
+		if (pDecoded_float)
+		{
+			for (uint32_t ch = 0; ch < 3; ch++)
+				pDecoded_float[level][ch] = half_to_float(decoded[ch]);
+		}
+	}
+}
+
+// Decodes mode 11 endpoints and produces the n interpolated block colors.
+// Colors are emitted in ASTC weight order, not linear order.
+// Returns false if the ISE endpoint quantization leads to non-valid endpoints being decoded; the outputs are not written then.
+bool get_astc_hdr_mode_11_block_colors(
+	const uint8_t* pEndpoints,
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	int endpoints_q12[2][3];
+	const bool decoded_ok = decode_mode11_to_qlog12(pEndpoints, endpoints_q12, ise_endpoint_range);
+
+	if (decoded_ok)
+		interpolate_qlog12_colors(endpoints_q12, pDecoded_half, pDecoded_float, n, ise_weight_range);
+
+	return decoded_ok;
+}
+
+// Decodes mode 7 endpoints and produces the n interpolated block colors.
+// Colors are emitted in ASTC weight order, not linear order.
+// Returns false if the ISE endpoint quantization leads to non-valid endpoints being decoded; the outputs are not written then.
+bool get_astc_hdr_mode_7_block_colors(
+	const uint8_t* pEndpoints,
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	int endpoints_q12[2][3];
+	const bool decoded_ok = decode_mode7_to_qlog12(pEndpoints, endpoints_q12, nullptr, ise_endpoint_range);
+
+	if (decoded_ok)
+		interpolate_qlog12_colors(endpoints_q12, pDecoded_half, pDecoded_float, n, ise_weight_range);
+
+	return decoded_ok;
+}
+
+// Floating point selector evaluation: for each pixel, picks the usable weight level whose decoded color has the
+// lowest weighted squared error against the pixel after both are mapped through q().
+//   num_pixels              - number of RGB half pixels in pBlock_pixels_half (1..MAX_ASTC_HDR_ENC_BLOCK_PIXELS)
+//   pWeights                - output, one chosen level index per pixel
+//   pBlock_pixels_half      - source pixels, 3 halfs per pixel
+//   num_weight_levels       - number of entries in pDecoded_half
+//   pDecoded_half           - decoded color of each weight level, 3 halfs per level
+//   coptions                - supplies the R/G error scales (blue is weighted 1) and the q() log bias
+//   usable_selector_bitmask - bit i set means level i may be chosen
+// Returns the summed error of the chosen levels.
+// Note: pWeights[p] is only written when some usable level compares below BIG_FLOAT_VAL.
+double eval_selectors_f(
+	uint32_t num_pixels,
+	uint8_t* pWeights,
+	const half_float* pBlock_pixels_half,
+	uint32_t num_weight_levels,
+	const half_float* pDecoded_half,
+	const astc_hdr_codec_base_options& coptions,
+	uint32_t usable_selector_bitmask)
+{
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(usable_selector_bitmask);
+
+	const float R_WEIGHT = coptions.m_r_err_scale;
+	const float G_WEIGHT = coptions.m_g_err_scale;
+
+	double total_error = 0;
+
+#ifdef _DEBUG
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
+	}
+#endif
+
+	// Map every decoded level through q() once, outside the per-pixel loop.
+	double decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		const half_float* p = &pDecoded_half[i * 3];
+
+		decoded_half_q[i][0] = q(p[0], coptions.m_q_log_bias);
+		decoded_half_q[i][1] = q(p[1], coptions.m_q_log_bias);
+		decoded_half_q[i][2] = q(p[2], coptions.m_q_log_bias);
+	}
+
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+		const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias);
+		const double desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias);
+		const double desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias);
+
+		double lowest_e = BIG_FLOAT_VAL;
+
+		// this is an approximation of MSLE
+		for (uint32_t i = 0; i < num_weight_levels; i++)
+		{
+			// Unsigned shift: the mask is a uint32_t, and a signed (1 << 31) would shift into the sign bit.
+			if (((1u << i) & usable_selector_bitmask) == 0)
+				continue;
+
+			// compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE
+			double rd = decoded_half_q[i][0] - desired_half_r_q;
+			double gd = decoded_half_q[i][1] - desired_half_g_q;
+			double bd = decoded_half_q[i][2] - desired_half_b_q;
+
+			double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+
+			if (e < lowest_e)
+			{
+				lowest_e = e;
+				pWeights[p] = (uint8_t)i;
+			}
+		}
+
+		total_error += lowest_e;
+
+	} // p
+
+	return total_error;
+}
+
+double eval_selectors(
+	uint32_t num_pixels,
+	uint8_t* pWeights,
+	uint32_t ise_weight_range,
+	const half_float* pBlock_pixels_half,
+	uint32_t num_weight_levels,
+	const half_float* pDecoded_half,
+	const astc_hdr_codec_base_options& coptions,
+	uint32_t usable_selector_bitmask)
+{
+	if ((coptions.m_r_err_scale != 2.0f) || (coptions.m_g_err_scale != 3.0f))
+	{
+		return eval_selectors_f(
+			num_pixels,
+			pWeights,
+			pBlock_pixels_half,
+			num_weight_levels,
+			pDecoded_half,
+			coptions,
+			usable_selector_bitmask);
+	}
+
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(usable_selector_bitmask);
+
+	uint64_t total_error = 0;
+
+#ifdef _DEBUG
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
+	}
+#endif
+
+	int64_t decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		const half_float* p = &pDecoded_half[i * 3];
+
+		decoded_half_q[i][0] = q2(p[0], coptions.m_q_log_bias);
+		decoded_half_q[i][1] = q2(p[1], coptions.m_q_log_bias);
+		decoded_half_q[i][2] = q2(p[2], coptions.m_q_log_bias);
+	}
+
+	if (usable_selector_bitmask != UINT32_MAX)
+	{
+		for (uint32_t p = 0; p < num_pixels; p++)
+		{
+			const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+			const int64_t desired_half_r_q = q2(pDesired_half[0], coptions.m_q_log_bias);
+			const int64_t desired_half_g_q = q2(pDesired_half[1], coptions.m_q_log_bias);
+			const int64_t desired_half_b_q = q2(pDesired_half[2], coptions.m_q_log_bias);
+
+			int64_t lowest_e = INT64_MAX;
+
+			for (uint32_t i = 0; i < num_weight_levels; i++)
+			{
+				if (((1 << i) & usable_selector_bitmask) == 0)
+					continue;
+
+				int64_t rd = decoded_half_q[i][0] - desired_half_r_q;
+				int64_t gd = decoded_half_q[i][1] - desired_half_g_q;
+				int64_t bd = decoded_half_q[i][2] - desired_half_b_q;
+
+				int64_t e = 2 * (rd * rd) + 3 * (gd * gd) + bd * bd;
+
+				if (e < lowest_e)
+				{
+					lowest_e = e;
+					pWeights[p] = (uint8_t)i;
+				}
+			}
+
+			total_error += lowest_e;
+
+		} // p
+	}
+	else
+	{
+		if ((num_weight_levels <= 4) || (coptions.m_disable_weight_plane_optimization))
+		{
+			for (uint32_t p = 0; p < num_pixels; p++)
+			{
+				const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+				const half_float desired_r = pDesired_half[0], desired_g = pDesired_half[1], desired_b = pDesired_half[2];
+
+				const int64_t desired_half_r_q = q2(desired_r, coptions.m_q_log_bias);
+				const int64_t desired_half_g_q = q2(desired_g, coptions.m_q_log_bias);
+				const int64_t desired_half_b_q = q2(desired_b, coptions.m_q_log_bias);
+
+				int64_t lowest_e = INT64_MAX;
+
+				uint32_t i;
+				for (i = 0; (i + 1) < num_weight_levels; i += 2)
+				{
+					int64_t e0, e1;
+
+					{
+						int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; // 27 bits maximum with half float inputs
+						int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q;
+						int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q;
+						e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; // max 62 bits (27*2+3+5)
+					}
+
+					{
+						int64_t rd1 = decoded_half_q[i + 1][0] - desired_half_r_q;
+						int64_t gd1 = decoded_half_q[i + 1][1] - desired_half_g_q;
+						int64_t bd1 = decoded_half_q[i + 1][2] - desired_half_b_q;
+						e1 = ((2 * (rd1 * rd1) + 3 * (gd1 * gd1) + bd1 * bd1) << 5) | (i + 1);
+					}
+
+					lowest_e = minimum(lowest_e, e0, e1);
+				}
+
+				if (i != num_weight_levels)
+				{
+					int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q;
+					int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q;
+					int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q;
+					int64_t e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i;
+
+					lowest_e = minimum(lowest_e, e0);
+				}
+
+				pWeights[p] = (uint8_t)(lowest_e & 31);
+
+				total_error += (lowest_e >> 5);
+
+			} // p
+		}
+		else
+		{
+			const auto& weight_val_to_ise_tab = astc_helpers::g_dequant_tables.get_weight_tab(ise_weight_range).m_val_to_ise;
+			const int lo_index = weight_val_to_ise_tab[0], hi_index = weight_val_to_ise_tab[64], mid_index = weight_val_to_ise_tab[32];
+
+			const vec3F low_color((float)pDecoded_half[lo_index * 3 + 0], (float)pDecoded_half[lo_index * 3 + 1], (float)pDecoded_half[lo_index * 3 + 2]);
+			const vec3F high_color((float)pDecoded_half[hi_index * 3 + 0], (float)pDecoded_half[hi_index * 3 + 1], (float)pDecoded_half[hi_index * 3 + 2]);
+			const vec3F mid_color((float)pDecoded_half[mid_index * 3 + 0], (float)pDecoded_half[mid_index * 3 + 1], (float)pDecoded_half[mid_index * 3 + 2]);
+						
+			const vec3F block_dir(high_color - low_color);
+
+			for (uint32_t p = 0; p < num_pixels; p++)
+			{
+				const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+				const half_float desired_r = pDesired_half[0], desired_g = pDesired_half[1], desired_b = pDesired_half[2];
+
+				const int64_t desired_half_r_q = q2(desired_r, coptions.m_q_log_bias);
+				const int64_t desired_half_g_q = q2(desired_g, coptions.m_q_log_bias);
+				const int64_t desired_half_b_q = q2(desired_b, coptions.m_q_log_bias);
+				
+				// Determine which side of the middle plane the point is for a modest gain
+				vec3F c((float)desired_r - mid_color[0], (float)desired_g - mid_color[1], (float)desired_b - mid_color[2]);
+				float d = c.dot(block_dir);
+								
+				int i = 0, high_index = (num_weight_levels / 2) + 1;
+				if (d >= 0.0f)
+				{
+					i = num_weight_levels / 2;
+					high_index = num_weight_levels;
+				}
+
+				int64_t lowest_e = INT64_MAX;
+
+				for (; (i + 1) < high_index; i += 2)
+				{
+					int64_t e0, e1;
+
+					{
+						int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; // 27 bits maximum with half float inputs
+						int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q;
+						int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q;
+						e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; // max 62 bits (27*2+3+5)
+					}
+
+					{
+						int64_t rd1 = decoded_half_q[i + 1][0] - desired_half_r_q;
+						int64_t gd1 = decoded_half_q[i + 1][1] - desired_half_g_q;
+						int64_t bd1 = decoded_half_q[i + 1][2] - desired_half_b_q;
+						e1 = ((2 * (rd1 * rd1) + 3 * (gd1 * gd1) + bd1 * bd1) << 5) | (i + 1);
+					}
+
+					lowest_e = minimum(lowest_e, e0, e1);
+				}
+
+				if (i != high_index)
+				{
+					int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q;
+					int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q;
+					int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q;
+					int64_t e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i;
+
+					lowest_e = minimum(lowest_e, e0);
+				}
+
+				pWeights[p] = (uint8_t)(lowest_e & 31);
+
+				total_error += (lowest_e >> 5);
+
+			} // p
+		}
+	}
+
+	return (double)total_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Dual plane selector evaluation. The channel at channel_index gets its own weight plane (pWeights1); the other
+// two channels share the first plane (pWeights0).
+//   channel_index           - channel (0..2) assigned to the second plane
+//   num_pixels              - number of RGB half pixels in pBlock_pixels_half (1..MAX_ASTC_HDR_ENC_BLOCK_PIXELS)
+//   pWeights0 / pWeights1   - outputs, one chosen level index per pixel for each plane
+//   pBlock_pixels_half      - source pixels, 3 halfs per pixel
+//   num_weight_levels       - number of entries in pDecoded_half
+//   pDecoded_half           - decoded color of each weight level, 3 halfs per level
+//   coptions                - supplies the R/G error scales (blue is weighted 1) and the q() log bias
+//   usable_selector_bitmask - bit i set means level i may be chosen
+// Returns the summed error of both planes.
+double eval_selectors_dual_plane(
+	uint32_t channel_index,
+	uint32_t num_pixels,
+	uint8_t* pWeights0, uint8_t* pWeights1,
+	const half_float* pBlock_pixels_half,
+	uint32_t num_weight_levels,
+	const half_float* pDecoded_half,
+	const astc_hdr_codec_base_options& coptions,
+	uint32_t usable_selector_bitmask)
+{
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(usable_selector_bitmask);
+
+	const float R_WEIGHT = coptions.m_r_err_scale;
+	const float G_WEIGHT = coptions.m_g_err_scale;
+
+	double total_error = 0;
+
+#ifdef _DEBUG
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
+	}
+#endif
+
+	// Map every decoded level through q() once, outside the per-pixel loops.
+	double decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		const half_float* p = &pDecoded_half[i * 3];
+
+		decoded_half_q[i][0] = q(p[0], coptions.m_q_log_bias);
+		decoded_half_q[i][1] = q(p[1], coptions.m_q_log_bias);
+		decoded_half_q[i][2] = q(p[2], coptions.m_q_log_bias);
+	}
+
+	const double channel_weights[3] = { R_WEIGHT, G_WEIGHT, 1.0f };
+
+	// The two channels that are not channel_index.
+	const uint32_t first_channel = (channel_index + 1) % 3;
+	const uint32_t second_channel = (channel_index + 2) % 3;
+
+	// First plane
+	const double first_channel_weight = channel_weights[first_channel];
+	const double second_channel_weight = channel_weights[second_channel];
+
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+		const double desired_half_x_q = q(pDesired_half[first_channel], coptions.m_q_log_bias);
+		const double desired_half_y_q = q(pDesired_half[second_channel], coptions.m_q_log_bias);
+
+		double lowest_e = BIG_FLOAT_VAL;
+
+		// this is an approximation of MSLE
+		for (uint32_t i = 0; i < num_weight_levels; i++)
+		{
+			// Unsigned shift: the mask is a uint32_t, and a signed (1 << 31) would shift into the sign bit.
+			if (((1u << i) & usable_selector_bitmask) == 0)
+				continue;
+
+			double xd = decoded_half_q[i][first_channel] - desired_half_x_q;
+			double yd = decoded_half_q[i][second_channel] - desired_half_y_q;
+
+			double e = first_channel_weight * (xd * xd) + second_channel_weight * (yd * yd);
+
+			if (e < lowest_e)
+			{
+				lowest_e = e;
+				pWeights0[p] = (uint8_t)i;
+			}
+		}
+
+		total_error += lowest_e;
+
+	} // p
+
+	// Second plane
+	const double alt_channel_weight = channel_weights[channel_index];
+
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
+
+		const double desired_half_a_q = q(pDesired_half[channel_index], coptions.m_q_log_bias);
+
+		double lowest_e = BIG_FLOAT_VAL;
+
+		// this is an approximation of MSLE
+		for (uint32_t i = 0; i < num_weight_levels; i++)
+		{
+			// Unsigned shift, same reason as in the first plane.
+			if (((1u << i) & usable_selector_bitmask) == 0)
+				continue;
+
+			double ad = decoded_half_q[i][channel_index] - desired_half_a_q;
+
+			double e = alt_channel_weight * (ad * ad);
+
+			if (e < lowest_e)
+			{
+				lowest_e = e;
+				pWeights1[p] = (uint8_t)i;
+			}
+		}
+
+		total_error += lowest_e;
+
+	} // p
+
+	return total_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Sums the weighted squared q()-space difference between two blocks of num_pixels RGB half pixels
+// (3 halfs per pixel). Red and green use coptions' error scales; blue is weighted 1.
+double compute_block_error(uint32_t num_pixels, const half_float* pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_base_options& coptions)
+{
+	const float red_scale = coptions.m_r_err_scale;
+	const float green_scale = coptions.m_g_err_scale;
+
+	double error_sum = 0;
+
+	const half_float* pOrig = pOrig_block;
+	const half_float* pPacked = pPacked_block;
+
+	for (uint32_t pixel = 0; pixel < num_pixels; pixel++, pOrig += 3, pPacked += 3)
+	{
+		const double diff_r = q(pOrig[0], coptions.m_q_log_bias) - q(pPacked[0], coptions.m_q_log_bias);
+		const double diff_g = q(pOrig[1], coptions.m_q_log_bias) - q(pPacked[1], coptions.m_q_log_bias);
+		const double diff_b = q(pOrig[2], coptions.m_q_log_bias) - q(pPacked[2], coptions.m_q_log_bias);
+
+		error_sum += red_scale * (diff_r * diff_r) + green_scale * (diff_g * diff_g) + diff_b * diff_b;
+	}
+
+	return error_sum;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+double compute_block_error_from_raw_weights(
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3],
+	const uint8_t* pRaw_weights,
+	int endpoints_qlog12[2][3],
+	const astc_hdr_codec_base_options& coptions)
+{
+	// qlog12->qlog16
+	int trial_e[2][3];
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		assert(endpoints_qlog12[0][i] <= (int)basist::MAX_QLOG12);
+		assert(endpoints_qlog12[1][i] <= (int)basist::MAX_QLOG12);
+
+		trial_e[0][i] = endpoints_qlog12[0][i] << 4;
+		trial_e[1][i] = endpoints_qlog12[1][i] << 4;
+	}
+
+	const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT = coptions.m_g_err_scale;
+
+	double trial_error = 0;
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p][0];
+
+		const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias);
+
+		const uint32_t c = pRaw_weights[p];
+		assert(c <= 64);
+
+		{
+			half_float rf, gf, bf;
+			{
+				uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0];
+				int ri = (r0 * (64 - c) + r1 * c + 32) / 64;
+				rf = astc_helpers::qlog16_to_half(ri);
+			}
+			{
+				uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1];
+				int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+				gf = astc_helpers::qlog16_to_half(gi);
+			}
+			{
+				uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2];
+				int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+				bf = astc_helpers::qlog16_to_half(bi);
+			}
+
+			const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias);
+			const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q;
+			trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+		}
+	}
+
+	return trial_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Clamps v to [l, h]. When clamping occurs, did_clamp is set to true and max_clamp_mag is raised to the
+// distance v was moved if that exceeds its current value; otherwise both are left untouched.
+static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag)
+{
+	assert(l < h);
+
+	if (v < l)
+	{
+		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, l - v);
+		did_clamp = true;
+		return l;
+	}
+
+	if (v > h)
+	{
+		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, v - h);
+		did_clamp = true;
+		return h;
+	}
+
+	return v;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+const uint8_t s_b_bits[8] = { 7, 8, 6, 7,  8, 6, 7, 6 }; // [submode] -> bit width of the b fields; read by pack_astc_mode11_submode()
+const uint8_t s_c_bits[8] = { 6, 6, 7, 7,  6, 7, 7, 7 }; // [submode] -> bit width of the c field
+const uint8_t s_d_bits[8] = { 7, 6, 7, 6,  5, 6, 5, 6 }; // [submode] -> bit width of the signed d fields
+
+// Packs two RGB endpoints into the 6 endpoint bytes of the given mode 11 submode (0..7).
+//   submode                      - selects the field widths: a = 9 + (submode >> 1) bits, b/c/d from s_b_bits/s_c_bits/s_d_bits
+//   pEndpoints                   - output, 6 bytes
+//   val_q                        - [0] = low, [1] = high endpoint. val_q[] must be already packed to qlog9-qlog12
+//                                  (i.e. to a_bits of precision). Modified in place: endpoints and channels are swapped.
+//   max_clamp_mag                - output, largest amount any field of the packed trial had to be clamped by (0 if none)
+//   early_out_if_clamped         - if true, give up as soon as a trial clamps by more than max_clamp_mag_accept_thresh
+//   max_clamp_mag_accept_thresh  - clamp magnitude still tolerated when early_out_if_clamped is set
+// Returns true if any field was clamped (the packed endpoints then differ from val_q), false for an exact pack.
+// On the early out path the function returns true with max_clamp_mag set and pEndpoints NOT written.
+bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped, int max_clamp_mag_accept_thresh)
+{
+	assert(submode <= 7);
+
+	const uint32_t a_bits = 9 + (submode >> 1);
+	const uint32_t b_bits = s_b_bits[submode];
+	const uint32_t c_bits = s_c_bits[submode];
+	const uint32_t d_bits = s_d_bits[submode];
+
+	const int max_a_val = (1 << a_bits) - 1;
+	const int max_b_val = (1 << b_bits) - 1;
+	const int max_c_val = (1 << c_bits) - 1;
+
+	// The maximum usable value before it turns to NaN/Inf
+	const int max_a_qlog = get_max_qlog(a_bits);
+	BASISU_NOTE_UNUSED(max_a_qlog);
+
+	// d is a signed field
+	const int min_d_val = -(1 << (d_bits - 1));
+	const int max_d_val = -min_d_val - 1;
+	assert((max_d_val - min_d_val + 1) == (1 << d_bits));
+
+	int highest_q = -1, highest_val = 0, highest_comp = 0;
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		assert(val_q[0][c] <= max_a_qlog);
+		assert(val_q[1][c] <= max_a_qlog);
+	}
+
+	// Locate the largest component over both endpoints (first one found wins on ties).
+	for (uint32_t v = 0; v < 2; v++)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val);
+
+			if (val_q[v][c] > highest_q)
+			{
+				highest_q = val_q[v][c];
+				highest_val = v;
+				highest_comp = c;
+			}
+		}
+	}
+
+	// A tie means the other endpoint holds the same maximum in the same channel, so either endpoint order is valid.
+	const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q);
+
+	// Reorder so the largest component ends up at val_q[1][0].
+	if (highest_val != 1)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			std::swap(val_q[0][c], val_q[1][c]);
+		}
+	}
+
+	if (highest_comp)
+	{
+		std::swap(val_q[0][0], val_q[0][highest_comp]);
+		std::swap(val_q[1][0], val_q[1][highest_comp]);
+	}
+
+	int orig_q[2][3];
+	memcpy(orig_q, val_q, sizeof(int) * 6);
+
+	// val[1][0] is now guaranteed to be highest
+	int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0;
+	int best_max_clamp_mag = 0;
+	bool best_did_clamp = false;
+	int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 } };
+	BASISU_NOTE_UNUSED(best_q);
+	uint32_t best_dist = UINT_MAX;
+
+	// Pass 0 tries the endpoints as ordered above. Pass 1 only runs when there was a tie and pass 0 clamped:
+	// it retries with low/high swapped and keeps whichever trial reproduces val_q more closely.
+	for (uint32_t pass = 0; pass < 2; pass++)
+	{
+		int trial_va = val_q[1][0];
+
+		assert(trial_va <= max_a_val);
+		assert(trial_va >= val_q[1][1]);
+		assert(trial_va >= val_q[1][2]);
+
+		assert(trial_va >= val_q[0][0]);
+		assert(trial_va >= val_q[0][1]);
+		assert(trial_va >= val_q[0][2]);
+
+		bool did_clamp = false;
+		int trial_max_clamp_mag = 0;
+
+		// Express the remaining five components as offsets from a, clamping each to its field's range.
+		int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag);
+		int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag);
+		int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag);
+		int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
+		int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
+
+		if ((early_out_if_clamped) && (did_clamp) && (trial_max_clamp_mag > max_clamp_mag_accept_thresh))
+		{
+			// With a tie, the swapped order (pass 1) still gets its chance before giving up.
+			if ((!had_tie) || (pass == 1))
+			{
+				max_clamp_mag = trial_max_clamp_mag;
+				return true;
+			}
+		}
+
+		if (!did_clamp)
+		{
+			// Make sure decoder gets the expected values
+			assert(trial_va == val_q[1][0]);
+			assert(trial_va - trial_vb0 == val_q[1][1]);
+			assert(trial_va - trial_vb1 == val_q[1][2]);
+
+			assert((trial_va - trial_vc) == val_q[0][0]);
+			assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]);
+			assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]);
+		}
+
+		// Reconstruct the endpoints from the (possibly clamped) fields.
+		const int r_e0 = clamp<int>(trial_va, 0, max_a_val);
+		const int r_e1 = clamp<int>(trial_va - trial_vb0, 0, max_a_val);
+		const int r_e2 = clamp<int>(trial_va - trial_vb1, 0, max_a_val);
+
+		const int r_f0 = clamp<int>(trial_va - trial_vc, 0, max_a_val);
+		const int r_f1 = clamp<int>(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val);
+		const int r_f2 = clamp<int>(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val);
+
+		assert(r_e0 <= max_a_qlog);
+		assert(r_e1 <= max_a_qlog);
+		assert(r_e2 <= max_a_qlog);
+
+		assert(r_f0 <= max_a_qlog);
+		assert(r_f1 <= max_a_qlog);
+		assert(r_f2 <= max_a_qlog);
+
+		// Records the current trial as the one to pack. The clamp magnitude is stored together with the
+		// field values, so the max_clamp_mag reported to the caller always belongs to the packed trial
+		// (it used to stay 0 when a clamped trial was chosen on the tie path).
+		const auto accept_trial = [&]()
+		{
+			best_va = trial_va;
+			best_vb0 = trial_vb0;
+			best_vb1 = trial_vb1;
+			best_vc = trial_vc;
+			best_vd0 = trial_vd0;
+			best_vd1 = trial_vd1;
+			best_max_clamp_mag = trial_max_clamp_mag;
+			best_did_clamp = did_clamp;
+
+			best_q[1][0] = r_e0;
+			best_q[1][1] = r_e1;
+			best_q[1][2] = r_e2;
+			best_q[0][0] = r_f0;
+			best_q[0][1] = r_f1;
+			best_q[0][2] = r_f2;
+		};
+
+		if ((!did_clamp) || (!had_tie))
+		{
+			// Either an exact pack, or no alternative ordering exists: this trial is final.
+			accept_trial();
+			break;
+		}
+
+		// we had a tie and it did clamp, try swapping L/H for a potential slight gain
+
+		const uint32_t r_dist1 = basisu::square<int>(r_e0 - val_q[1][0]) + basisu::square<int>(r_e1 - val_q[1][1]) + basisu::square<int>(r_e2 - val_q[1][2]);
+		const uint32_t r_dist0 = basisu::square<int>(r_f0 - val_q[0][0]) + basisu::square<int>(r_f1 - val_q[0][1]) + basisu::square<int>(r_f2 - val_q[0][2]);
+
+		const uint32_t total_dist = r_dist1 + r_dist0;
+
+		if (total_dist < best_dist)
+		{
+			best_dist = total_dist;
+
+			accept_trial();
+		}
+
+		for (uint32_t c = 0; c < 3; c++)
+			std::swap(val_q[0][c], val_q[1][c]);
+	}
+
+	// pack bits now
+	int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0;
+
+	// x0..x5 are the six submode dependent "extra" bits.
+	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0;
+	switch (submode)
+	{
+	case 0:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 1:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 2:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 3:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 4:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
+		break;
+	case 5:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 6:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
+		break;
+	case 7:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	default:
+		break;
+	}
+
+	// write mode
+	pack_bit(v1, 7, submode, 0);
+	pack_bit(v2, 7, submode, 1);
+	pack_bit(v3, 7, submode, 2);
+
+	// highest component
+	pack_bit(v4, 7, highest_comp, 0);
+	pack_bit(v5, 7, highest_comp, 1);
+
+	// write bit 8 of va
+	pack_bit(v1, 6, best_va, 8);
+
+	// extra bits
+	pack_bit(v2, 6, x0);
+	pack_bit(v3, 6, x1);
+	pack_bit(v4, 6, x2);
+	pack_bit(v5, 6, x3);
+	pack_bit(v4, 5, x4);
+	pack_bit(v5, 5, x5);
+
+	// low bits of every field
+	v0 = best_va & 0xFF;
+	v1 |= (best_vc & 63);
+	v2 |= (best_vb0 & 63);
+	v3 |= (best_vb1 & 63);
+	v4 |= (best_vd0 & 31);
+	v5 |= (best_vd1 & 31);
+
+	assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255));
+
+	pEndpoints[0] = (uint8_t)v0;
+	pEndpoints[1] = (uint8_t)v1;
+	pEndpoints[2] = (uint8_t)v2;
+	pEndpoints[3] = (uint8_t)v3;
+	pEndpoints[4] = (uint8_t)v4;
+	pEndpoints[5] = (uint8_t)v5;
+
+#ifdef _DEBUG
+	// Test for valid pack by unpacking
+	{
+		// Undo the channel swap so the comparison happens in the caller's channel order.
+		if (highest_comp)
+		{
+			std::swap(best_q[0][0], best_q[0][highest_comp]);
+			std::swap(best_q[1][0], best_q[1][highest_comp]);
+
+			std::swap(orig_q[0][0], orig_q[0][highest_comp]);
+			std::swap(orig_q[1][0], orig_q[1][highest_comp]);
+		}
+
+		int test_e[2][3];
+		decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS);
+		for (uint32_t i = 0; i < 2; i++)
+		{
+			for (uint32_t j = 0; j < 3; j++)
+			{
+				assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits));
+
+				if (!best_did_clamp)
+				{
+					assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) ||
+						(orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits)));
+				}
+			}
+		}
+	}
+#endif
+
+	max_clamp_mag = best_max_clamp_mag;
+
+	return best_did_clamp;
+}
+
+// Quantizes a pair of qlog16 endpoint colors to the 'a' precision of the given mode 11 submode
+// and forwards the quantized values to the int val_q[2][3] overload, which does the actual packing.
+//   submode                      - 0..7
+//   pEndpoints                   - receives the packed endpoint bytes (written by the forwarded overload)
+//   low_q16/high_q16             - endpoint colors, one qlog16 value per component
+//   max_clamp_mag, early_out_if_clamped, max_clamp_mag_accept_thresh - passed through unchanged
+// Returns whatever the forwarded overload returns.
+bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped, int max_clamp_mag_accept_thresh)
+{
+	assert(submode <= 7);
+
+	// 'a' precision grows by one bit for every two submodes (9 bits for 0/1 ... 12 bits for 6/7).
+	const uint32_t a_bits = 9 + (submode >> 1);
+	const int top_code = (1 << a_bits) - 1;
+
+	// The maximum usable value before it turns to NaN/Inf
+	const int qlog_ceiling = get_max_qlog(a_bits);
+
+	int val_q[2][3];
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		int& lo = val_q[0][c];
+		int& hi = val_q[1][c];
+
+		// Quantize straight from qlog16. Going through half floats instead
+		// (qlog16_to_half() + half_to_qlog7_12()) was noted as very slightly better,
+		// but ~8% slower, likely due to the table lookups.
+		// TODO: Tune quant_qlog16() for higher precision.
+		lo = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits);
+		hi = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits);
+
+		if (lo == hi)
+		{
+			// Both ends landed on the same code: move each one step apart (where there is room),
+			// following the ordering of the unquantized inputs.
+			const bool ascending = (low_q16[c] < high_q16[c]);
+			int& shrink = ascending ? lo : hi;
+			int& grow = ascending ? hi : lo;
+
+			if (shrink)
+				shrink--;
+
+			if (grow != top_code)
+				grow++;
+		}
+
+		lo = minimum<uint32_t>(lo, qlog_ceiling);
+		hi = minimum<uint32_t>(hi, qlog_ceiling);
+	}
+
+	return pack_astc_mode11_submode(submode, pEndpoints, val_q, max_clamp_mag, early_out_if_clamped, max_clamp_mag_accept_thresh);
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Packs two qlog16 endpoint colors into the six "direct" mode 11 endpoint bytes.
+// Bytes are interleaved per component: pEndpoints[2*i] is the low value and pEndpoints[2*i+1] the high value.
+// Components 0 and 1 are stored as 8-bit qlog values; component 2 is stored as a 7-bit qlog value with bit 7 set.
+void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16)
+{
+	// Ensure low endpoint is generally less bright (by component sum) than high in direct mode.
+	float low_sum = l_q16.dot(vec3F(1.0f)), high_sum = h_q16.dot(vec3F(1.0f));
+	if (low_sum > high_sum)
+		std::swap(l_q16, h_q16);
+
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		const bool seven_bit = (i == 2);
+		const int q_limit = seven_bit ? MAX_QLOG7 : MAX_QLOG8;
+
+		// TODO: This goes from QLOG16->HALF->QLOG8/7
+		half_float lo_half = astc_helpers::qlog16_to_half(clamp((int)std::round(l_q16[i]), 0, 65535));
+		half_float hi_half = astc_helpers::qlog16_to_half(clamp((int)std::round(h_q16[i]), 0, 65535));
+
+		int lo_q, hi_q;
+
+		if (seven_bit)
+		{
+			lo_q = g_half_to_qlog7[bounds_check((uint32_t)lo_half, 0U, 32768U)];
+			hi_q = g_half_to_qlog7[bounds_check((uint32_t)hi_half, 0U, 32768U)];
+		}
+		else
+		{
+			// An alternative that was tried here quantizes R and G as 7 bits vs. 8, for grayscale
+			// (g_half_to_qlog7[...] << 1); it is not used.
+			lo_q = g_half_to_qlog8[bounds_check((uint32_t)lo_half, 0U, 32768U)];
+			hi_q = g_half_to_qlog8[bounds_check((uint32_t)hi_half, 0U, 32768U)];
+		}
+
+		lo_q = minimum<uint32_t>(lo_q, q_limit);
+		hi_q = minimum<uint32_t>(hi_q, q_limit);
+
+		if (lo_q == hi_q)
+		{
+			// Identical codes: nudge the two ends one step apart (where there is room),
+			// keeping the ordering of the unquantized inputs.
+			const bool ascending = (l_q16[i] <= h_q16[i]);
+			int& shrink = ascending ? lo_q : hi_q;
+			int& grow = ascending ? hi_q : lo_q;
+
+			if (shrink)
+				shrink--;
+
+			if (grow != q_limit)
+				grow++;
+		}
+
+		assert(lo_q <= q_limit && hi_q <= q_limit);
+
+		if (seven_bit)
+		{
+			// Bit 7 of the last two bytes is always set in the direct layout.
+			lo_q |= 128;
+			hi_q |= 128;
+		}
+
+		pEndpoints[2 * i + 0] = (uint8_t)lo_q;
+		pEndpoints[2 * i + 1] = (uint8_t)hi_q;
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Packs an RGB color plus a scale value (all given as qlog16 floats) into the four mode 7 endpoint bytes
+// for one of the six submodes.
+//   submode                     - 0..5, selects the per-field bit widths from the tables below
+//   pEndpoints                  - receives 4 bytes; left untouched if the function early-outs
+//   rgb_q16, s_q16              - color and scale to quantize
+//   max_clamp_mag               - out: largest amount by which any packed field had to be clamped (0 if none)
+//   ise_weight_range            - weight range; ranges >= 4 enable the upward bias of the quantized values
+//   early_out_if_clamped        - if true, return immediately once a clamp exceeds max_clamp_mag_accept_thresh
+//   max_clamp_mag_accept_thresh - threshold used by the early out
+// Returns true if any field had to be clamped to fit its bit width, otherwise false.
+bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	assert(submode <= 5);
+	max_clamp_mag = 0;
+
+	// Per-submode bit widths: first (major) component, the two other components, and the scale.
+	static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 };
+	static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 };
+	static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 };
+
+	// The precision of the components
+	const uint32_t prec_bits = s_r_bits[submode];
+
+	// qlog[0..2] = color components, qlog[3] = scale; pack_bits[i] = bits available to store qlog[i].
+	int qlog[4], pack_bits[4];
+
+	for (uint32_t i = 0; i < 4; i++)
+	{
+		const float f = (i == 3) ? s_q16 : rgb_q16[i];
+
+		// The # of bits the component is packed into
+		if (i == 0)
+			pack_bits[i] = s_r_bits[submode];
+		else if (i == 3)
+			pack_bits[i] = s_s_bits[submode];
+		else
+			pack_bits[i] = s_g_b_bits[submode];
+
+#if 0
+		// this is slightly worse
+		// TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error.
+		half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16);
+		qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits);
+#else
+		// All four values (including the scale) are quantized at prec_bits precision.
+		qlog[i] = quant_qlog16(clamp<int>((int)std::round(f), 0, MAX_QLOG16), prec_bits);
+
+		// Only bias if there are enough texel weights, 4=6 weights
+		if (ise_weight_range >= 4)
+		{
+			// Explicitly bias the high color, and the scale up, to better exploit the weights.
+			// The quantized range also then encompasses the complete input range.
+			const uint32_t max_val = (1 << prec_bits) - 1;
+			const uint32_t K = 3;
+			if (i == 3)
+			{
+				qlog[i] = minimum<uint32_t>(qlog[i] + K * 2, max_val);
+			}
+			else
+			{
+				qlog[i] = minimum<uint32_t>(qlog[i] + K, max_val);
+			}
+		}
+#endif
+
+		// Color components are additionally limited to get_max_qlog(prec_bits); the scale is not.
+		if (i != 3)
+			qlog[i] = minimum<uint32_t>(qlog[i], get_max_qlog(prec_bits));
+
+		// If S=0, we lose freedom for the texel weights to add any value.
+		if ((i == 3) && (qlog[i] == 0))
+			qlog[i] = 1;
+	}
+
+	// Index of the largest color component; stays 0 for submode 5, which skips the block below.
+	uint32_t maj_index = 0;
+
+	bool did_clamp = false;
+
+	if (submode != 5)
+	{
+		// Find the largest component (first one wins on ties) and move it to slot 0.
+		int largest_qlog = 0;
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			if (qlog[i] > largest_qlog)
+			{
+				largest_qlog = qlog[i];
+				maj_index = i;
+			}
+		}
+
+		if (maj_index)
+		{
+			std::swap(qlog[0], qlog[maj_index]);
+		}
+
+		assert(qlog[0] >= qlog[1]);
+		assert(qlog[0] >= qlog[2]);
+
+		// The two other components are stored as differences from the major component.
+		qlog[1] = qlog[0] - qlog[1];
+		qlog[2] = qlog[0] - qlog[2];
+
+		// Clamp the two differences and the scale to their packed widths, tracking the largest overshoot.
+		for (uint32_t i = 1; i < 4; i++)
+		{
+			const int max_val = (1 << pack_bits[i]) - 1;
+
+			if (qlog[i] > max_val)
+			{
+				max_clamp_mag = maximum<int>(max_clamp_mag, qlog[i] - max_val);
+				qlog[i] = max_val;
+				did_clamp = true;
+
+				// Early out: nothing has been written to pEndpoints at this point.
+				if ((early_out_if_clamped) && (max_clamp_mag > max_clamp_mag_accept_thresh))
+					return true;
+			}
+		}
+	}
+
+	// Sanity check (debug builds): every value must now fit into its packed width.
+	for (uint32_t i = 0; i < 4; i++)
+	{
+		const int max_val = (1 << pack_bits[i]) - 1; (void)max_val;
+
+		assert(qlog[i] <= max_val);
+	}
+
+	int mode = 0;
+
+	// Low bits of each field go into the low bits of the four output bytes.
+	int r = qlog[0] & 63; // 6-bits
+	int g = qlog[1] & 31; // 5-bits
+	int b = qlog[2] & 31; // 5-bits
+	int s = qlog[3] & 31; // 5-bits
+
+	// x0..x6 carry the remaining high bits; which field bits they hold depends on the submode.
+	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0;
+
+	switch (submode)
+	{
+	case 0:
+	{
+		// Submodes 0-3: mode bits 3:2 hold the major component index, bits 1:0 the submode.
+		mode = (maj_index << 2) | 0;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 9); // R9
+		x1 = get_bit(qlog[0], 8); // R8
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[0], 10); // R10
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 1:
+	{
+		mode = (maj_index << 2) | 1;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 8); // R8
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[0], 10); // R10
+		x6 = get_bit(qlog[0], 9); // R9
+		break;
+	}
+	case 2:
+	{
+		mode = (maj_index << 2) | 2;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 9); // R9
+		x1 = get_bit(qlog[0], 8); // R8
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[0], 6); // R6
+		x4 = get_bit(qlog[3], 7); // S7
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 3:
+	{
+		mode = (maj_index << 2) | 3;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 8); // R8
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 4:
+	{
+		// Submode 4: mode bits 3:2 are both set and bits 1:0 hold the major component index (0..2).
+		mode = maj_index | 0xC; // 0b1100
+		assert((mode & 0xC) == 0xC);
+		assert(mode != 0xF);
+
+		x0 = get_bit(qlog[1], 6); // G6
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[2], 6); // B6
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[0], 7); // R7
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 5:
+	{
+		// Submode 5: all four mode bits set; components are stored as-is (no major component, no differences).
+		mode = 0xF;
+
+		x0 = get_bit(qlog[1], 6); // G6
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[2], 6); // B6
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	default:
+	{
+		assert(0);
+		break;
+	}
+	}
+
+	// Byte layout: [M1 M0 r5..0] [M2 x0 x1 g4..0] [M3 x2 x3 b4..0] [x4 x5 x6 s4..0]
+	pEndpoints[0] = (uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r);
+	pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g);
+	pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b);
+	pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s);
+
+#ifdef _DEBUG
+	// Test for valid pack by unpacking
+	{
+		// Rebuild the expected 12-bit endpoints from the (possibly clamped) quantized values ...
+		const int inv_shift = 12 - prec_bits;
+
+		int unpacked_e[2][3];
+		if (submode != 5)
+		{
+			// qlog[1]/qlog[2] are differences from qlog[0] here; the low endpoint is the high one minus the scale.
+			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
+			unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF);
+			unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF);
+
+			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF);
+		}
+		else
+		{
+			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
+			unpacked_e[1][1] = left_shift32(qlog[1], inv_shift);
+			unpacked_e[1][2] = left_shift32(qlog[2], inv_shift);
+
+			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF);
+		}
+
+		// Undo the major component swap so the channels are back in their original order.
+		if (maj_index)
+		{
+			std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]);
+			std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]);
+		}
+
+		// ... and compare against what the decoder produces from the bytes just written.
+		int e[2][3];
+		decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr);
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			assert(unpacked_e[0][i] == e[0][i]);
+			assert(unpacked_e[1][i] == e[1][i]);
+		}
+	}
+#endif
+
+	return did_clamp;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Serializes an already-populated mode 11 descriptor into its six endpoint bytes.
+// pEndpoints is zeroed first. Returns false if any descriptor field is outside the range it can be
+// stored in, true once all six bytes have been written.
+bool pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints)
+{
+	memset(pEndpoints, 0, NUM_MODE11_ENDPOINTS);
+
+	if (desc.is_direct())
+	{
+		// Direct layout: (a, c, b0) go to the even bytes and (b1, d0, d1) to the odd bytes.
+		// The last value of each triple is 7 bits wide and gets bit 7 set.
+		const bool first_triple_ok =
+			(desc.m_a >= 0) && (desc.m_a <= 255) &&
+			(desc.m_c >= 0) && (desc.m_c <= 255) &&
+			(desc.m_b0 >= 0) && (desc.m_b0 <= 127);
+		if (!first_triple_ok)
+			return false;
+
+		pEndpoints[0] = (uint8_t)desc.m_a;
+		pEndpoints[2] = (uint8_t)desc.m_c;
+		pEndpoints[4] = (uint8_t)desc.m_b0 | 128;
+
+		// Note the even bytes above stay written if this second check fails.
+		const bool second_triple_ok =
+			(desc.m_b1 >= 0) && (desc.m_b1 <= 255) &&
+			(desc.m_d0 >= 0) && (desc.m_d0 <= 255) &&
+			(desc.m_d1 >= 0) && (desc.m_d1 <= 127);
+		if (!second_triple_ok)
+			return false;
+
+		pEndpoints[1] = (uint8_t)desc.m_b1;
+		pEndpoints[3] = (uint8_t)desc.m_d0;
+		pEndpoints[5] = (uint8_t)desc.m_d1 | 128;
+
+		return true;
+	}
+
+	// Every field must lie inside the limits recorded in the descriptor.
+	const bool fields_in_range =
+		(desc.m_a >= 0) && (desc.m_a <= desc.m_max_a_val) &&
+		(desc.m_c >= 0) && (desc.m_c <= desc.m_max_c_val) &&
+		(desc.m_b0 >= 0) && (desc.m_b0 <= desc.m_max_b_val) &&
+		(desc.m_b1 >= 0) && (desc.m_b1 <= desc.m_max_b_val) &&
+		(desc.m_d0 >= desc.m_min_d_val) && (desc.m_d0 <= desc.m_max_d_val) &&
+		(desc.m_d1 >= desc.m_min_d_val) && (desc.m_d1 <= desc.m_max_d_val);
+	if (!fields_in_range)
+		return false;
+
+	const int a = desc.m_a, c = desc.m_c;
+	const int b0 = desc.m_b0, b1 = desc.m_b1;
+	const int d0 = desc.m_d0, d1 = desc.m_d1;
+
+	// Six "extra" bit slots whose meaning depends on the submode.
+	int ext0 = 0, ext1 = 0, ext2 = 0, ext3 = 0, ext4 = 0, ext5 = 0;
+	switch (desc.m_submode)
+	{
+	case 0:
+		ext0 = get_bit(b0, 6);
+		ext1 = get_bit(b1, 6);
+		ext2 = get_bit(d0, 6);
+		ext3 = get_bit(d1, 6);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	case 1:
+		ext0 = get_bit(b0, 6);
+		ext1 = get_bit(b1, 6);
+		ext2 = get_bit(b0, 7);
+		ext3 = get_bit(b1, 7);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	case 2:
+		ext0 = get_bit(a, 9);
+		ext1 = get_bit(c, 6);
+		ext2 = get_bit(d0, 6);
+		ext3 = get_bit(d1, 6);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	case 3:
+		ext0 = get_bit(b0, 6);
+		ext1 = get_bit(b1, 6);
+		ext2 = get_bit(a, 9);
+		ext3 = get_bit(c, 6);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	case 4:
+		ext0 = get_bit(b0, 6);
+		ext1 = get_bit(b1, 6);
+		ext2 = get_bit(b0, 7);
+		ext3 = get_bit(b1, 7);
+		ext4 = get_bit(a, 9);
+		ext5 = get_bit(a, 10);
+		break;
+	case 5:
+		ext0 = get_bit(a, 9);
+		ext1 = get_bit(a, 10);
+		ext2 = get_bit(c, 7);
+		ext3 = get_bit(c, 6);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	case 6:
+		ext0 = get_bit(b0, 6);
+		ext1 = get_bit(b1, 6);
+		ext2 = get_bit(a, 11);
+		ext3 = get_bit(c, 6);
+		ext4 = get_bit(a, 9);
+		ext5 = get_bit(a, 10);
+		break;
+	case 7:
+		ext0 = get_bit(a, 9);
+		ext1 = get_bit(a, 10);
+		ext2 = get_bit(a, 11);
+		ext3 = get_bit(c, 6);
+		ext4 = get_bit(d0, 5);
+		ext5 = get_bit(d1, 5);
+		break;
+	default:
+		break;
+	}
+
+	int packed[6] = { 0, 0, 0, 0, 0, 0 };
+
+	// Byte 0: low 8 bits of a.
+	packed[0] = a & 0xFF;
+
+	// Byte 1: submode bit 0, bit 8 of a, low 6 bits of c.
+	pack_bit(packed[1], 7, desc.m_submode, 0);
+	pack_bit(packed[1], 6, a, 8);
+	packed[1] |= (c & 63);
+
+	// Bytes 2/3: submode bits 1/2, one extra bit each, low 6 bits of b0/b1.
+	pack_bit(packed[2], 7, desc.m_submode, 1);
+	pack_bit(packed[2], 6, ext0);
+	packed[2] |= (b0 & 63);
+
+	pack_bit(packed[3], 7, desc.m_submode, 2);
+	pack_bit(packed[3], 6, ext1);
+	packed[3] |= (b1 & 63);
+
+	// Bytes 4/5: major component bits 0/1, two extra bits each, low 5 bits of d0/d1.
+	pack_bit(packed[4], 7, desc.m_maj_comp, 0);
+	pack_bit(packed[4], 6, ext2);
+	pack_bit(packed[4], 5, ext4);
+	packed[4] |= (d0 & 31);
+
+	pack_bit(packed[5], 7, desc.m_maj_comp, 1);
+	pack_bit(packed[5], 6, ext3);
+	pack_bit(packed[5], 5, ext5);
+	packed[5] |= (d1 & 31);
+
+	for (uint32_t i = 0; i < 6; i++)
+	{
+		assert(in_range(packed[i], 0, 255));
+		pEndpoints[i] = (uint8_t)packed[i];
+	}
+
+	return true;
+}
+
+// Sign extends the low num_src_bits bits of src (two's complement) to a full int.
+// num_src_bits must be in [2, 31]. Bits of src above num_src_bits are discarded for non-negative
+// values and forced to 1 for negative ones.
+static inline int astc_hdr_sign_extend(int src, int num_src_bits)
+{
+	assert(basisu::in_range(num_src_bits, 2, 31));
+
+	// Build the low-bit mask in unsigned arithmetic: with num_src_bits == 31 (allowed by the assert)
+	// the signed expression (1 << 31) - 1 overflows. The result always fits in an int.
+	const int value_mask = (int)((1u << num_src_bits) - 1u);
+	const int sign_bit = 1 << (num_src_bits - 1);
+
+	if (src & sign_bit)
+		return src | ~value_mask;
+
+	return src & value_mask;
+}
+
+// Parses six mode 11 endpoint bytes into a descriptor (the inverse of pack_mode11(desc, ...)).
+// desc is cleared first. When the major component field reads 3, only the six raw values are
+// filled in and the function returns early; otherwise the submode, the field values, their bit
+// widths and their valid ranges are all populated.
+void unpack_mode11(const uint8_t* pEndpoints, mode11_log_desc& desc)
+{
+	clear_obj(desc);
+
+	// The 2-bit major component index lives in bit 7 of bytes 4 and 5.
+	pack_bit(desc.m_maj_comp, 0, pEndpoints[4], 7);
+	pack_bit(desc.m_maj_comp, 1, pEndpoints[5], 7);
+
+	if (desc.m_maj_comp == 3)
+	{
+		// Raw layout: even bytes hold (a, c, b0), odd bytes hold (b1, d0, d1); the last of each is 7 bits.
+		desc.m_a = pEndpoints[0];
+		desc.m_b1 = pEndpoints[1];
+		desc.m_c = pEndpoints[2];
+		desc.m_d0 = pEndpoints[3];
+		desc.m_b0 = pEndpoints[4] & 0x7F;
+		desc.m_d1 = pEndpoints[5] & 0x7F;
+
+		return;
+	}
+
+	// The 3-bit submode is spread over bit 7 of bytes 1..3.
+	pack_bit(desc.m_submode, 0, pEndpoints[1], 7);
+	pack_bit(desc.m_submode, 1, pEndpoints[2], 7);
+	pack_bit(desc.m_submode, 2, pEndpoints[3], 7);
+
+	// Fixed-position low bits of every field.
+	desc.m_a = pEndpoints[0];		// 8 bits
+	pack_bit(desc.m_a, 8, pEndpoints[1], 6);	// plus bit 8
+
+	desc.m_c = pEndpoints[1] & 63;	// 6 bits
+	desc.m_b0 = pEndpoints[2] & 63; // 6 bits
+	desc.m_b1 = pEndpoints[3] & 63; // 6 bits
+	desc.m_d0 = pEndpoints[4] & 31; // 5 bits
+	desc.m_d1 = pEndpoints[5] & 31; // 5 bits
+
+	// Six submode-dependent extra bits.
+	const int ext0 = get_bit(pEndpoints[2], 6);
+	const int ext1 = get_bit(pEndpoints[3], 6);
+	const int ext2 = get_bit(pEndpoints[4], 6);
+	const int ext3 = get_bit(pEndpoints[5], 6);
+	const int ext4 = get_bit(pEndpoints[4], 5);
+	const int ext5 = get_bit(pEndpoints[5], 5);
+
+	switch (desc.m_submode)
+	{
+	case 0:
+		pack_bit(desc.m_b0, 6, ext0, 0);
+		pack_bit(desc.m_b1, 6, ext1, 0);
+		pack_bit(desc.m_d0, 6, ext2, 0);
+		pack_bit(desc.m_d1, 6, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	case 1:
+		pack_bit(desc.m_b0, 6, ext0, 0);
+		pack_bit(desc.m_b1, 6, ext1, 0);
+		pack_bit(desc.m_b0, 7, ext2, 0);
+		pack_bit(desc.m_b1, 7, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	case 2:
+		pack_bit(desc.m_a, 9, ext0, 0);
+		pack_bit(desc.m_c, 6, ext1, 0);
+		pack_bit(desc.m_d0, 6, ext2, 0);
+		pack_bit(desc.m_d1, 6, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	case 3:
+		pack_bit(desc.m_b0, 6, ext0, 0);
+		pack_bit(desc.m_b1, 6, ext1, 0);
+		pack_bit(desc.m_a, 9, ext2, 0);
+		pack_bit(desc.m_c, 6, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	case 4:
+		pack_bit(desc.m_b0, 6, ext0, 0);
+		pack_bit(desc.m_b1, 6, ext1, 0);
+		pack_bit(desc.m_b0, 7, ext2, 0);
+		pack_bit(desc.m_b1, 7, ext3, 0);
+		pack_bit(desc.m_a, 9, ext4, 0);
+		pack_bit(desc.m_a, 10, ext5, 0);
+		break;
+	case 5:
+		pack_bit(desc.m_a, 9, ext0, 0);
+		pack_bit(desc.m_a, 10, ext1, 0);
+		pack_bit(desc.m_c, 7, ext2, 0);
+		pack_bit(desc.m_c, 6, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	case 6:
+		pack_bit(desc.m_b0, 6, ext0, 0);
+		pack_bit(desc.m_b1, 6, ext1, 0);
+		pack_bit(desc.m_a, 11, ext2, 0);
+		pack_bit(desc.m_c, 6, ext3, 0);
+		pack_bit(desc.m_a, 9, ext4, 0);
+		pack_bit(desc.m_a, 10, ext5, 0);
+		break;
+	case 7:
+	default:
+		pack_bit(desc.m_a, 9, ext0, 0);
+		pack_bit(desc.m_a, 10, ext1, 0);
+		pack_bit(desc.m_a, 11, ext2, 0);
+		pack_bit(desc.m_c, 6, ext3, 0);
+		pack_bit(desc.m_d0, 5, ext4, 0);
+		pack_bit(desc.m_d1, 5, ext5, 0);
+		break;
+	}
+
+	// Field widths for this submode.
+	desc.m_a_bits = 9 + (desc.m_submode >> 1);
+	desc.m_b_bits = s_b_bits[desc.m_submode];
+	desc.m_c_bits = s_c_bits[desc.m_submode];
+	desc.m_d_bits = s_d_bits[desc.m_submode];
+
+	// a, b and c are unsigned; d is a signed two's complement field.
+	desc.m_max_a_val = (1 << desc.m_a_bits) - 1;
+	desc.m_max_b_val = (1 << desc.m_b_bits) - 1;
+	desc.m_max_c_val = (1 << desc.m_c_bits) - 1;
+
+	desc.m_min_d_val = -(1 << (desc.m_d_bits - 1));
+	desc.m_max_d_val = -desc.m_min_d_val - 1;
+
+	desc.m_d0 = astc_hdr_sign_extend(desc.m_d0, desc.m_d_bits);
+	desc.m_d1 = astc_hdr_sign_extend(desc.m_d1, desc.m_d_bits);
+
+	assert((desc.m_a >= 0) && (desc.m_a <= desc.m_max_a_val));
+	assert((desc.m_c >= 0) && (desc.m_c <= desc.m_max_c_val));
+	assert((desc.m_b0 >= 0) && (desc.m_b0 <= desc.m_max_b_val));
+	assert((desc.m_b1 >= 0) && (desc.m_b1 <= desc.m_max_b_val));
+	assert((desc.m_d0 >= desc.m_min_d_val) && (desc.m_d0 <= desc.m_max_d_val));
+	assert((desc.m_d1 >= desc.m_min_d_val) && (desc.m_d1 <= desc.m_max_d_val));
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Reads the submode index and the major component index from six mode 11 endpoint bytes.
+// Both are assembled from bit 7 of consecutive bytes: bytes 1..3 give the three submode bits
+// (LSB first), bytes 4..5 give the two major component bits (LSB first).
+void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index)
+{
+	submode_index = 0;
+	for (int bit = 0; bit < 3; bit++)
+		pack_bit(submode_index, bit, pEndpoints[1 + bit], 7);
+
+	maj_index = 0;
+	for (int bit = 0; bit < 2; bit++)
+		pack_bit(maj_index, bit, pEndpoints[4 + bit], 7);
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Reads the submode index (0..5) and the major component index from mode 7 endpoint bytes.
+// Only the top bits of the first three bytes are examined.
+void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int &maj_index)
+{
+	// Assemble the 4-bit mode value: bits 1:0 from byte 0 bits 7:6, bit 2 from byte 1 bit 7, bit 3 from byte 2 bit 7.
+	const int mode_low = (pEndpoints[0] >> 6) & 3;
+	const int mode_bit2 = (pEndpoints[1] >> 7) & 1;
+	const int mode_bit3 = (pEndpoints[2] >> 7) & 1;
+	const int modeval = mode_low | (mode_bit2 << 2) | (mode_bit3 << 3);
+
+	if (modeval == 0xF)
+	{
+		// All mode bits set: submode 5, which has no major component.
+		maj_index = 0;
+		submode_index = 5;
+		return;
+	}
+
+	if ((modeval & 0xC) == 0xC)
+	{
+		// Top two bits set: submode 4, major component in the low two bits.
+		maj_index = modeval & 3;
+		submode_index = 4;
+		return;
+	}
+
+	// Otherwise: major component in the top two bits, submode 0..3 in the low two.
+	maj_index = modeval >> 2;
+	submode_index = modeval & 3;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+// TODO: Use pack_mode11() as a shared function.
+
+// Searches the mode 11 submodes (and optionally direct mode, addressed as submode -1) for the endpoint
+// encoding that, after requantization to ise_endpoint_range and decoding, lies closest to the requested
+// low/high colors.
+//   low_color_q16/high_color_q16 - target endpoint colors in qlog16
+//   ise_endpoint_range           - endpoint ISE range the result is requantized to
+//   pEndpoints                   - receives NUM_MODE11_ENDPOINTS bytes; zeroed up front, so it stays zero on failure
+//   direct_only                  - if true, only direct mode is tried (first/last_submode are ignored)
+//   first_submode/last_submode   - inclusive submode range, -1..7; the search runs from last down to first
+//   ignore_clamping              - if true, submodes are accepted no matter how much they had to clamp
+//   submode_used                 - out: chosen submode + 1 (so 0 means direct); written even on failure
+// Returns true if at least one candidate decoded successfully.
+bool pack_mode11(
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	uint32_t ise_endpoint_range, uint8_t* pEndpoints, 
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used)
+{
+	if (direct_only)
+		first_submode = last_submode = -1;
+
+	assert(first_submode <= last_submode);
+	assert((first_submode >= -1) && (first_submode <= 7));
+	assert((last_submode >= -1) && (last_submode <= 7));
+
+	memset(pEndpoints, 0, NUM_MODE11_ENDPOINTS);
+
+	// Squared error between a decoded (12-bit) endpoint pair and the targets, taking whichever
+	// of the two possible low/high assignments is closer.
+	const auto endpoint_pair_dist = [&](int (&qlog12)[2][3]) -> double
+	{
+		vec3F dec0((float)(qlog12[0][0] << 4), (float)(qlog12[0][1] << 4), (float)(qlog12[0][2] << 4));
+		vec3F dec1((float)(qlog12[1][0] << 4), (float)(qlog12[1][1] << 4), (float)(qlog12[1][2] << 4));
+
+		double straight = dec0.squared_distance_d(low_color_q16) + dec1.squared_distance_d(high_color_q16);
+		double flipped = dec1.squared_distance_d(low_color_q16) + dec0.squared_distance_d(high_color_q16);
+		return helpers::minimum(straight, flipped);
+	};
+
+	double lowest_dist = BIG_FLOAT_VAL;
+	int chosen_submode = 0;
+
+	uint8_t full_range_endpoints[NUM_MODE11_ENDPOINTS];
+
+	for (int submode = last_submode; submode >= first_submode; submode--)
+	{
+		bool clamped = false;
+
+		if (submode == -1)
+		{
+			// If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision.
+			pack_astc_mode11_direct(full_range_endpoints, low_color_q16, high_color_q16);
+		}
+		else
+		{
+			const int MAX_CLAMP_MAG_ACCEPT_THRESH = 32;
+			int clamp_mag = 0;
+			clamped = pack_astc_mode11_submode(submode, full_range_endpoints, low_color_q16, high_color_q16, clamp_mag, !ignore_clamping, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+			// If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts.
+			if ((!ignore_clamping) && (clamped) && (clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+				continue;
+		}
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		uint8_t requantized[NUM_MODE11_ENDPOINTS];
+		basist::astc_6x6_hdr::requantize_ise_endpoints(11, astc_helpers::BISE_256_LEVELS, full_range_endpoints, ise_endpoint_range, requantized);
+
+		int decoded[2][3];
+		if (!decode_mode11_to_qlog12(requantized, decoded, ise_endpoint_range))
+			continue;
+
+		const double dist = endpoint_pair_dist(decoded);
+		if (dist < lowest_dist)
+		{
+			lowest_dist = dist;
+			chosen_submode = submode;
+			memcpy(pEndpoints, requantized, NUM_MODE11_ENDPOINTS);
+		}
+
+		// Optionally stop at the first candidate that decoded and did not clamp.
+		if ((coptions.m_take_first_non_clamping_mode11_submode) && (!clamped))
+			break;
+
+	} // submode
+
+	const bool refine =
+		(coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(lowest_dist != BIG_FLOAT_VAL);
+
+	if (refine)
+	{
+		// Try moving each endpoint value of the search winner one rank down and one rank up,
+		// keeping any single-value change that decodes closer to the targets.
+		uint8_t search_winner[NUM_MODE11_ENDPOINTS];
+		memcpy(search_winner, pEndpoints, NUM_MODE11_ENDPOINTS);
+
+		for (uint32_t c = 0; c < NUM_MODE11_ENDPOINTS; c++)
+		{
+			for (int step = -1; step <= 1; step += 2)
+			{
+				uint8_t candidate[NUM_MODE11_ENDPOINTS];
+				memcpy(candidate, search_winner, NUM_MODE11_ENDPOINTS);
+
+				int ise = candidate[c];
+
+				int rank = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_rank[ise];
+				rank = clamp<int>(rank + step, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				ise = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_rank_to_ISE[rank];
+
+				candidate[c] = (uint8_t)ise;
+
+				int decoded[2][3];
+				if (!decode_mode11_to_qlog12(candidate, decoded, ise_endpoint_range))
+					continue;
+
+				const double dist = endpoint_pair_dist(decoded);
+				if (dist < lowest_dist)
+				{
+					lowest_dist = dist;
+					memcpy(pEndpoints, candidate, NUM_MODE11_ENDPOINTS);
+				}
+			} // step
+		} // c
+	} // if (refine)
+
+	// NOTE(review): the refinement above does not update chosen_submode - confirm that a one-rank
+	// change can never alter the submode bits stored in the endpoint bytes.
+	submode_used = chosen_submode + 1;
+
+	return (lowest_dist != BIG_FLOAT_VAL);
+}
+
+// Tries to encode a block with mode 11 endpoints and keeps the result only if it beats cur_block_error.
+//   num_pixels                     - number of pixels in block_pixels_half
+//   pEndpoints, pWeights           - out: endpoint bytes / per-pixel weights, written only on improvement
+//   cur_block_error                - in/out: best error so far; lowered on improvement
+//   submode_used                   - out: chosen submode + 1 (0 = direct), written only on improvement
+//   low_color_q16/high_color_q16   - target endpoint colors in qlog16
+//   block_pixels_half              - source pixels as half floats
+//   num_weight_levels, ise_weight_range - weight quantization (must agree with each other)
+//   direct_only, ise_endpoint_range, first_submode, last_submode, ignore_clamping
+//                                  - endpoint search controls, forwarded to pack_mode11()
+//   constrain_ise_weight_selectors - restricts the usable selectors for the 16 and 12 level weight ranges
+// Returns true if the block was improved.
+bool try_mode11(uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	const basist::half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping) // -1, 7
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_weight_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	// The endpoint search (submode loop plus optional ultra-quant refinement) is identical to
+	// pack_mode11(), so delegate to it instead of keeping a second copy of that code here.
+	// The result goes to a local buffer so the caller's endpoints are only touched on improvement.
+	uint8_t best_trial_endpoints[NUM_MODE11_ENDPOINTS];
+	uint32_t trial_submode_used = 0;
+
+	if (!pack_mode11(low_color_q16, high_color_q16, ise_endpoint_range, best_trial_endpoints, coptions,
+		direct_only, first_submode, last_submode, ignore_clamping, trial_submode_used))
+	{
+		return false;
+	}
+
+	// Expand the chosen endpoints into the table of colors selectable by each weight level.
+	half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+	if (!get_astc_hdr_mode_11_block_colors(best_trial_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		return false;
+
+	// Optionally restrict which selectors may be used for the 16 and 12 level weight ranges.
+	uint32_t usable_selector_bitmask = UINT32_MAX;
+	if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
+	else if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_12_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3);
+
+	uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	double trial_blk_error = eval_selectors(num_pixels, trial_weights, ise_weight_range, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
+	if (trial_blk_error >= cur_block_error)
+		return false;
+
+	cur_block_error = trial_blk_error;
+	memcpy(pEndpoints, best_trial_endpoints, NUM_MODE11_ENDPOINTS);
+	memcpy(pWeights, trial_weights, num_pixels);
+	submode_used = trial_submode_used;
+
+	return true;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Dual-plane variant of the mode 11 trial: packs the requested low/high endpoint colors with each candidate
+// submode, keeps the packing whose decoded endpoints land closest to the request, then evaluates both weight
+// planes against the block. Outputs (pEndpoints, pWeights0, pWeights1, cur_block_error, submode_used) are only
+// written when the resulting block error beats cur_block_error. Returns true in that case, false otherwise.
+bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	const basist::half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping) // -1, 7
+{
+	assert(channel_index <= 2);
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_weight_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	// direct_only collapses the search to the single direct (-1) packing.
+	if (direct_only)
+		first_submode = last_submode = -1;
+
+	assert(first_submode <= last_submode);
+	assert((first_submode >= -1) && (first_submode <= 7));
+	assert((last_submode >= -1) && (last_submode <= 7));
+
+	// Scratch for the 256-level packing; shared by all iterations of the submode loop.
+	uint8_t packed_endpoints[NUM_MODE11_ENDPOINTS];
+
+	uint8_t winning_endpoints[NUM_MODE11_ENDPOINTS];
+	clear_obj(winning_endpoints);
+
+	double winning_dist = BIG_FLOAT_VAL;
+	int winning_submode = 0;
+
+	// Walk the submodes from last to first; -1 selects the direct packing.
+	for (int sm = last_submode; sm >= first_submode; sm--)
+	{
+		bool clamped = false;
+		int clamp_mag = 0;
+
+		if (sm != -1)
+		{
+			const int MAX_CLAMP_MAG_ACCEPT_THRESH = 32;
+			clamped = pack_astc_mode11_submode(sm, packed_endpoints, low_color_q16, high_color_q16, clamp_mag, !ignore_clamping, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+			// A large clamp distorts the endpoint colors too much, so skip this submode unless the caller opted out.
+			if ((!ignore_clamping) && (clamped) && (clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+				continue;
+		}
+		else
+		{
+			// Direct packing can't clamp, but has low precision.
+			pack_astc_mode11_direct(packed_endpoints, low_color_q16, high_color_q16);
+		}
+
+		// Requantizing away from 256 levels may distort the endpoints (possibly a lot), but the result stays a valid encoding.
+		uint8_t requant_endpoints[NUM_MODE11_ENDPOINTS];
+		basist::astc_6x6_hdr::requantize_ise_endpoints(11, astc_helpers::BISE_256_LEVELS, packed_endpoints, ise_endpoint_range, requant_endpoints);
+
+		int qlog[2][3];
+		if (!decode_mode11_to_qlog12(requant_endpoints, qlog, ise_endpoint_range))
+			continue;
+
+		// Shift the decoded values left by 4 so they compare against the q16 inputs.
+		vec3F dec0((float)(qlog[0][0] << 4), (float)(qlog[0][1] << 4), (float)(qlog[0][2] << 4));
+		vec3F dec1((float)(qlog[1][0] << 4), (float)(qlog[1][1] << 4), (float)(qlog[1][2] << 4));
+
+		// Score both endpoint orderings and take the better one.
+		double straight_dist = dec0.squared_distance_d(low_color_q16) + dec1.squared_distance_d(high_color_q16);
+		double swapped_dist = dec1.squared_distance_d(low_color_q16) + dec0.squared_distance_d(high_color_q16);
+		double cand_dist = helpers::minimum(straight_dist, swapped_dist);
+
+		if (cand_dist < winning_dist)
+		{
+			winning_dist = cand_dist;
+			winning_submode = sm;
+			memcpy(winning_endpoints, requant_endpoints, sizeof(winning_endpoints));
+		}
+
+		if ((coptions.m_take_first_non_clamping_mode11_submode) && (!clamped))
+			break;
+	} // sm
+
+	const bool refine_endpoints = (coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(winning_dist != BIG_FLOAT_VAL);
+
+	if (refine_endpoints)
+	{
+		// Perturb one endpoint value at a time by one rank down, then one rank up, always starting from the same seed.
+		uint8_t seed_endpoints[NUM_MODE11_ENDPOINTS];
+		memcpy(seed_endpoints, winning_endpoints, NUM_MODE11_ENDPOINTS);
+
+		const auto& endpoint_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range);
+
+		for (uint32_t comp = 0; comp < NUM_MODE11_ENDPOINTS; comp++)
+		{
+			for (int delta = -1; delta <= 1; delta += 2)
+			{
+				uint8_t nudged_endpoints[NUM_MODE11_ENDPOINTS];
+				memcpy(nudged_endpoints, seed_endpoints, NUM_MODE11_ENDPOINTS);
+
+				int cur_rank = endpoint_tab.m_ISE_to_rank[nudged_endpoints[comp]];
+				cur_rank = clamp<int>(cur_rank + delta, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				const int new_ise = endpoint_tab.m_rank_to_ISE[cur_rank];
+				nudged_endpoints[comp] = (uint8_t)new_ise;
+
+				int qlog[2][3];
+				if (!decode_mode11_to_qlog12(nudged_endpoints, qlog, ise_endpoint_range))
+					continue;
+
+				vec3F dec0((float)(qlog[0][0] << 4), (float)(qlog[0][1] << 4), (float)(qlog[0][2] << 4));
+				vec3F dec1((float)(qlog[1][0] << 4), (float)(qlog[1][1] << 4), (float)(qlog[1][2] << 4));
+
+				double straight_dist = dec0.squared_distance_d(low_color_q16) + dec1.squared_distance_d(high_color_q16);
+				double swapped_dist = dec1.squared_distance_d(low_color_q16) + dec0.squared_distance_d(high_color_q16);
+				double cand_dist = helpers::minimum(straight_dist, swapped_dist);
+
+				if (cand_dist < winning_dist)
+				{
+					winning_dist = cand_dist;
+					memcpy(winning_endpoints, nudged_endpoints, NUM_MODE11_ENDPOINTS);
+				}
+			} // delta
+		} // comp
+	} // if (refine_endpoints)
+
+	// No packing decoded successfully, so there is nothing to evaluate.
+	if (winning_dist == BIG_FLOAT_VAL)
+		return false;
+
+	half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+	if (!get_astc_hdr_mode_11_block_colors(winning_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		return false;
+
+	// Optionally restrict which selectors may be used for the 16 and 12 level weight ranges.
+	uint32_t usable_selector_bitmask = UINT32_MAX;
+	if (constrain_ise_weight_selectors)
+	{
+		if (ise_weight_range == astc_helpers::BISE_16_LEVELS)
+			usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
+		else if (ise_weight_range == astc_helpers::BISE_12_LEVELS)
+			usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3);
+	}
+
+	uint8_t plane0_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS], plane1_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+	const double candidate_error = eval_selectors_dual_plane(channel_index, num_pixels, plane0_weights, plane1_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
+	if (candidate_error < cur_block_error)
+	{
+		cur_block_error = candidate_error;
+		memcpy(pEndpoints, winning_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(pWeights0, plane0_weights, num_pixels);
+		memcpy(pWeights1, plane1_weights, num_pixels);
+		// Reported submode is offset by one, so the direct packing (-1) is reported as 0.
+		submode_used = winning_submode + 1;
+		return true;
+	}
+
+	return false;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Packs mode 7 endpoints for the requested high color and scale into pEndpoints at ise_endpoint_range.
+//
+// Every submode in [first_submode, last_submode] is packed at 256 levels, requantized to ise_endpoint_range,
+// decoded again, and scored by its squared distance to (high_color_q16, s_q16). The closest candidate wins.
+// pEndpoints is zeroed up front, so it stays all-zero when no candidate is usable.
+//
+// ise_weight_range is only forwarded to the submode packer.
+// ignore_clamping: when false, submodes below 5 that clamp by more than the accept threshold are skipped.
+// submode_used: receives the winning submode index (0 when nothing was usable).
+// Returns true if at least one candidate decoded successfully.
+bool pack_mode7(
+	const vec3F& high_color_q16, const float s_q16,
+	uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+	uint32_t ise_weight_range, // only used for determining biasing during packing
+	const astc_hdr_codec_base_options& coptions,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used)
+{
+	assert(first_submode <= last_submode);
+	assert((first_submode >= 0) && (first_submode <= (int)MAX_MODE7_SUBMODE_INDEX));
+	assert(last_submode <= (int)MAX_MODE7_SUBMODE_INDEX);
+
+	uint8_t unquant_trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+	memset(pEndpoints, 0, NUM_MODE7_ENDPOINTS);
+
+	double best_trial_dist = BIG_FLOAT_VAL;
+	int best_trial_submode = 0;
+
+	for (int submode = first_submode; submode <= last_submode; submode++)
+	{
+		const int MAX_CLAMP_MAG_ACCEPT_THRESH = 16;
+
+		int max_clamp_mag = 0;
+		const bool did_clamp = pack_astc_mode7_submode(submode, unquant_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range, !ignore_clamping, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+		// Only submodes below 5 are rejected for excessive clamping; submodes 5 and up are always evaluated.
+		if (submode < 5)
+		{
+			if (!ignore_clamping)
+			{
+				if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+					continue;
+			}
+		}
+
+		uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		basist::astc_6x6_hdr::requantize_ise_endpoints(7, astc_helpers::BISE_256_LEVELS, unquant_trial_endpoints, ise_endpoint_range, trial_endpoints);
+
+		int e[2][3];
+		int decoded_s = 0;
+		if (!decode_mode7_to_qlog12(trial_endpoints, e, &decoded_s, ise_endpoint_range))
+			continue;
+
+		// e1 is always the high color
+		vec3F e1(
+			(float)(e[1][0] << 4),
+			(float)(e[1][1] << 4),
+			(float)(e[1][2] << 4)
+		);
+
+		// Shift the decoded scale left by 4 as well, so it compares against s_q16.
+		decoded_s <<= 4;
+
+		// The scale error is weighted by 3 (presumably once per color channel).
+		double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+		if (dist < best_trial_dist)
+		{
+			best_trial_dist = dist;
+			best_trial_submode = submode;
+			memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
+		}
+
+		if (coptions.m_take_first_non_clamping_mode7_submode)
+		{
+			if (!did_clamp)
+				break;
+		}
+
+	} // submode
+
+	// Optional refinement: nudge each quantized endpoint value one rank down/up and keep any variation that
+	// decodes closer to the request. Only meaningful when the endpoints were requantized below 256 levels.
+	if ((coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(best_trial_dist != BIG_FLOAT_VAL))
+	{
+		// All variations start from the same unrefined result, so improvements are never compounded.
+		uint8_t orig_best_trial_endpoints[NUM_MODE7_ENDPOINTS];
+		memcpy(orig_best_trial_endpoints, pEndpoints, NUM_MODE7_ENDPOINTS);
+
+		for (uint32_t c = 0; c < NUM_MODE7_ENDPOINTS; c++)
+		{
+			for (int dt = 0; dt <= 1; dt++)
+			{
+				const int d = dt ? 1 : -1;
+
+				uint8_t varied_endpoints[NUM_MODE7_ENDPOINTS];
+				memcpy(varied_endpoints, orig_best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+
+				int ise = varied_endpoints[c];
+
+				// Step in rank order (clamped to the valid range), then map back to the ISE value.
+				int rank = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_rank[ise];
+				rank = clamp<int>(rank + d, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				ise = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_rank_to_ISE[rank];
+
+				varied_endpoints[c] = (uint8_t)ise;
+
+				int e[2][3];
+				int decoded_s = 0;
+				if (!decode_mode7_to_qlog12(varied_endpoints, e, &decoded_s, ise_endpoint_range))
+					continue;
+
+				// e1 is always the high color
+				vec3F e1(
+					(float)(e[1][0] << 4),
+					(float)(e[1][1] << 4),
+					(float)(e[1][2] << 4)
+				);
+
+				decoded_s <<= 4;
+
+				double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+				if (dist < best_trial_dist)
+				{
+					best_trial_dist = dist;
+					memcpy(pEndpoints, varied_endpoints, NUM_MODE7_ENDPOINTS);
+				}
+
+			} // d
+		} // c
+	}
+
+	submode_used = best_trial_submode;
+
+	return (best_trial_dist != BIG_FLOAT_VAL);
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode the block with mode 7 using the requested high color and scale.
+//
+// Every submode in [first_submode, last_submode] is packed at 256 levels, requantized to ise_endpoint_range,
+// decoded again, and scored by its squared distance to (high_color_q16, s_q16). The closest packing is then
+// used to pick per-pixel weights and compute the block error.
+//
+// pEndpoints, pWeights, cur_block_error and submode_used are only written when the resulting block error is
+// lower than cur_block_error. Returns true in that case, false otherwise.
+bool try_mode7(
+	uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& high_color_q16, const float s_q16,
+	const half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
+	uint32_t ise_endpoint_range,
+	int32_t first_submode, int32_t last_submode)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert(first_submode <= last_submode);
+	assert((first_submode >= 0) && (first_submode <= (int)MAX_MODE7_SUBMODE_INDEX));
+	assert(last_submode <= (int)MAX_MODE7_SUBMODE_INDEX);
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	uint8_t unquant_trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+	uint8_t best_trial_endpoints[NUM_MODE7_ENDPOINTS];
+	clear_obj(best_trial_endpoints);
+	double best_trial_dist = BIG_FLOAT_VAL;
+	int best_trial_submode = 0;
+
+	for (int submode = first_submode; submode <= last_submode; submode++)
+	{
+		const int MAX_CLAMP_MAG_ACCEPT_THRESH = 16;
+
+		int max_clamp_mag = 0;
+		const bool did_clamp = pack_astc_mode7_submode(submode, unquant_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range, true, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+		// Only submodes below 5 are rejected for excessive clamping; submodes 5 and up are always evaluated.
+		if (submode < 5)
+		{
+			if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+				continue;
+		}
+
+		uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		basist::astc_6x6_hdr::requantize_ise_endpoints(7, astc_helpers::BISE_256_LEVELS, unquant_trial_endpoints, ise_endpoint_range, trial_endpoints);
+
+		int e[2][3];
+		int decoded_s = 0;
+		if (!decode_mode7_to_qlog12(trial_endpoints, e, &decoded_s, ise_endpoint_range))
+			continue;
+
+		// e1 is always the high color
+		vec3F e1(
+			(float)(e[1][0] << 4),
+			(float)(e[1][1] << 4),
+			(float)(e[1][2] << 4)
+		);
+
+		// Shift the decoded scale left by 4 as well, so it compares against s_q16.
+		decoded_s <<= 4;
+
+		// The scale error is weighted by 3 (presumably once per color channel).
+		double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+		if (dist < best_trial_dist)
+		{
+			best_trial_dist = dist;
+			best_trial_submode = submode;
+			memcpy(best_trial_endpoints, trial_endpoints, sizeof(best_trial_endpoints));
+		}
+
+		if (coptions.m_take_first_non_clamping_mode7_submode)
+		{
+			if (!did_clamp)
+				break;
+		}
+
+	} // submode
+
+	// Optional refinement: nudge each quantized endpoint value one rank down/up and keep any variation that
+	// decodes closer to the request. Only meaningful when the endpoints were requantized below 256 levels.
+	if ((coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(best_trial_dist != BIG_FLOAT_VAL))
+	{
+		// All variations start from the same unrefined result, so improvements are never compounded.
+		uint8_t orig_best_trial_endpoints[NUM_MODE7_ENDPOINTS];
+		memcpy(orig_best_trial_endpoints, best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+
+		for (uint32_t c = 0; c < NUM_MODE7_ENDPOINTS; c++)
+		{
+			for (int dt = 0; dt <= 1; dt++)
+			{
+				const int d = dt ? 1 : -1;
+
+				uint8_t varied_endpoints[NUM_MODE7_ENDPOINTS];
+				memcpy(varied_endpoints, orig_best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+
+				int ise = varied_endpoints[c];
+
+				// Step in rank order (clamped to the valid range), then map back to the ISE value.
+				int rank = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_rank[ise];
+				rank = clamp<int>(rank + d, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				ise = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_rank_to_ISE[rank];
+
+				varied_endpoints[c] = (uint8_t)ise;
+
+				int e[2][3];
+				int decoded_s = 0;
+				if (!decode_mode7_to_qlog12(varied_endpoints, e, &decoded_s, ise_endpoint_range))
+					continue;
+
+				// e1 is always the high color
+				vec3F e1(
+					(float)(e[1][0] << 4),
+					(float)(e[1][1] << 4),
+					(float)(e[1][2] << 4)
+				);
+
+				decoded_s <<= 4;
+
+				double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+				if (dist < best_trial_dist)
+				{
+					best_trial_dist = dist;
+					memcpy(best_trial_endpoints, varied_endpoints, NUM_MODE7_ENDPOINTS);
+				}
+
+			} // d
+		} // c
+	}
+
+	bool improved_flag = false;
+
+	if (best_trial_dist != BIG_FLOAT_VAL)
+	{
+		half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+		uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+		if (get_astc_hdr_mode_7_block_colors(best_trial_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		{
+			// Endpoint distance only selects the packing; the decision to accept it is made on block error.
+			double trial_blk_error = eval_selectors(num_pixels, trial_weights, ise_weight_range, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions);
+			if (trial_blk_error < cur_block_error)
+			{
+				cur_block_error = trial_blk_error;
+				memcpy(pEndpoints, best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+				memcpy(pWeights, trial_weights, num_pixels);
+				submode_used = best_trial_submode;
+				improved_flag = true;
+			}
+		}
+	}
+
+	return improved_flag;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+const float LOW_EMPHASIS_WEIGHT = 1.0f, MIDDLE_EMPHASIS_WEIGHT = 1.25f, HIGH_EMPHASIS_WEIGHT = 1.0f; // Default low/middle/high per-pixel emphasis weights for encode_astc_hdr_block_mode_11()'s weighted least squares path.
+const float LOW_EMPHASIS_WEIGHT_HEAVY = 1.0f, MIDDLE_EMPHASIS_WEIGHT_HEAVY = 4.0f, HIGH_EMPHASIS_WEIGHT_HEAVY = 1.0f; // Variant used instead when opt_mode == cWeightedLeastSquaresHeavy.
+
+// Encodes a block with ASTC HDR mode 11 at full weight resolution (one weight per pixel).
+//
+// Steps:
+//  1. Take the block mean and principal axis (from pBlock_stats when supplied, otherwise computed here), pick the
+//     two pixels with the extreme projections onto that axis, and pull each 1/64 of the way towards the other.
+//  2. Run try_mode11() on those endpoints. If it produces nothing usable, return cur_block_error unchanged.
+//  3. Unless opt_mode is cNoOpt, refine the endpoints from the chosen weights for up to 3 passes (weighted
+//     average, ordinary least squares, or emphasis-weighted least squares), stopping when a pass doesn't improve.
+//  4. If uber_mode is set (least squares modes only), also try endpoints refit after moving the lowest and/or
+//     highest used weight one step inwards.
+//
+// blk_endpoints, blk_weights and best_submode are only updated when a trial beats the running cur_block_error;
+// best_submode is reset to 0 on entry. Returns the (possibly improved) block error.
+double encode_astc_hdr_block_mode_11(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, uint8_t* blk_weights,
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only,
+	uint32_t ise_endpoint_range,
+	bool uber_mode,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+
+	best_submode = 0;
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	// Use the caller's precomputed block statistics when available, otherwise compute them here.
+	vec3F block_mean_color_q16, block_axis_q16;
+	if (!pBlock_stats)
+	{
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+		block_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16);
+	}
+	else
+	{
+		assert(num_pixels == pBlock_stats->m_num_pixels);
+		block_mean_color_q16 = pBlock_stats->m_mean_q16;
+		block_axis_q16 = pBlock_stats->m_axis_q16;
+	}
+
+	aabb3F color_box_q16(cInitExpand);
+
+	// l/h track the smallest/largest projection onto the block axis; the matching pixels seed the endpoints.
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	vec3F low_color_q16, high_color_q16;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = pBlock_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = pBlock_pixels_q16[i];
+		}
+	}
+
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+
+	// Pull each endpoint 1/64 of the way towards the other.
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+
+	uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+	uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights);
+
+	// The first trial starts from BIG_FLOAT_VAL so it reports any usable solution, not only one beating cur_block_error.
+	double trial_blk_error = BIG_FLOAT_VAL;
+
+	bool did_improve = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+		low_color_q16, high_color_q16,
+		pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+		first_submode, last_submode, ignore_clamping);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+		return cur_block_error;
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(blk_weights, trial_blk_weights, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+	if (opt_mode == cNoOpt)
+		return cur_block_error;
+
+	// least squares on the most promising trial weight indices found
+	const uint32_t NUM_LS_PASSES = 3;
+
+	// Only filled in (and only read) by the weighted least squares modes.
+	float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+	if (opt_mode == cWeightedAverage)
+	{
+		const uint32_t NUM_OPT_PASSES = 3;
+		for (uint32_t pass = 0; pass < NUM_OPT_PASSES; pass++)
+		{
+			vec3F low_p(0.0f);
+			float total_low = 0.0f;
+
+			vec3F high_p(0.0f);
+			float total_high = 0.0f;
+
+			// Each pixel contributes to the low/high averages in proportion to its current interpolation weight.
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				vec3F p(pBlock_pixels_q16[i]);
+				const float weight_frac = g_ise_weight_lerps[ise_weight_range][trial_blk_weights[i] + 1] * (1.0f / 64.0f);
+
+				low_p += p * (1.0f - weight_frac);
+				total_low += (1.0f - weight_frac);
+
+				high_p += p * weight_frac;
+				total_high += weight_frac;
+			}
+
+			if (total_low != 0.0f)
+				low_p *= (1.0f / total_low);
+
+			if (total_high != 0.0f)
+				high_p *= (1.0f / total_high);
+
+			bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				low_p, high_p,
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+				first_submode, last_submode, ignore_clamping);
+
+			if (!was_improved)
+				break;
+
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+		}
+	}
+	else if (opt_mode == cOrdinaryLeastSquares)
+	{
+		for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
+		{
+			vec3F l_q16, h_q16;
+
+			if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16))
+				break;
+
+			bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				l_q16, h_q16,
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+				first_submode, last_submode, ignore_clamping);
+
+			if (!was_improved)
+				break;
+
+			// It's improved, so let's take the new weight indices.
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+		} // pass
+	}
+	else
+	{
+		if (h == l)
+		{
+			// All pixels project to the same point on the axis, so weight them equally.
+			for (uint32_t i = 0; i < num_pixels; i++)
+				emphasis_weights[i] = 1.0f;
+		}
+		else
+		{
+			// Normalized position of the block mean (projection 0) between the low and high extremes.
+			float mid = (0.0f - l) / (h - l);
+			mid = clamp(mid, .01f, .99f);
+
+			float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT;
+			if (opt_mode == cWeightedLeastSquaresHeavy)
+			{
+				lw = LOW_EMPHASIS_WEIGHT_HEAVY;
+				mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY;
+				hw = HIGH_EMPHASIS_WEIGHT_HEAVY;
+			}
+
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+				float kd = k.dot(block_axis_q16);
+
+				assert((kd >= l) && (kd <= h));
+
+				float v = (kd - l) / (h - l);
+
+				// Piecewise-linear emphasis curve: lw at v=0, mw at v=mid, hw at v=1.
+				// Both halves normalize their interpolation parameter to [0,1]. The upper half divides by
+				// (1 - mid), which is at least .01 because mid is clamped to at most .99.
+				if (v < mid)
+					v = lerp(lw, mw, v / mid);
+				else
+					v = lerp(mw, hw, (v - mid) / (1.0f - mid));
+
+				emphasis_weights[i] = v;
+			}
+
+#if 0
+			if (num_pixels == 6 * 6)
+			{
+				const float EDGE_WEIGHT = .1f;
+				for (uint32_t i = 0; i < 6; i++)
+				{
+					emphasis_weights[i] += EDGE_WEIGHT;
+					emphasis_weights[i + 5 * 6] += EDGE_WEIGHT;
+					emphasis_weights[i * 6] += EDGE_WEIGHT;
+					emphasis_weights[5 + i * 6] += EDGE_WEIGHT;
+				}
+			}
+#endif
+		}
+
+		for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
+		{
+			vec3F l_q16, h_q16;
+
+			if (!compute_weighted_least_squares_endpoints_rgb(
+				num_pixels,
+				trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr,
+				emphasis_weights,
+				&l_q16, &h_q16,
+				pBlock_pixels_q16,
+				color_box_q16))
+				break;
+
+			bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				l_q16, h_q16,
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+				first_submode, last_submode, ignore_clamping);
+
+			if (!was_improved)
+				break;
+
+			// It's improved, so let's take the new weight indices.
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+		} // pass
+	}
+
+	if ( (uber_mode) && (ise_weight_range >= astc_helpers::BISE_3_LEVELS) &&
+		((opt_mode == cOrdinaryLeastSquares) || (opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy)) )
+	{
+		// Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost.
+
+		uint8_t temp_astc_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+		memcpy(temp_astc_weights, trial_blk_weights, num_pixels);
+
+		// Find the lowest and highest weights in use, in linear (not ASTC) order.
+		uint32_t min_lin_sel = 256, max_lin_sel = 0;
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const uint32_t astc_sel = temp_astc_weights[i];
+
+			const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+			assert(lin_sel < num_weight_levels);
+
+			min_lin_sel = minimumu(min_lin_sel, lin_sel);
+			max_lin_sel = maximumu(max_lin_sel, lin_sel);
+		}
+
+		bool was_improved = false;
+		(void)was_improved;
+
+		// Variation 1: move every pixel using the lowest weight one step up.
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
+				{
+					lin_sel++;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+
+				bool succeeded;
+				if (opt_mode == cOrdinaryLeastSquares)
+					succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+				else
+					succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+
+				if (succeeded)
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+						first_submode, last_submode, ignore_clamping))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+
+		// Variation 2: move every pixel using the highest weight one step down.
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
+				{
+					lin_sel--;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+
+				bool succeeded;
+				if (opt_mode == cOrdinaryLeastSquares)
+					succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+				else
+					succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+
+				if (succeeded)
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+						first_submode, last_submode, ignore_clamping))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+
+		// Variation 3: do both at once (highest one step down, otherwise lowest one step up).
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
+				{
+					lin_sel--;
+					weights_changed = true;
+				}
+				else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
+				{
+					lin_sel++;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+				bool succeeded;
+				if (opt_mode == cOrdinaryLeastSquares)
+					succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+				else
+					succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+
+				if (succeeded)
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+						first_submode, last_submode, ignore_clamping))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+
+	} // uber_mode
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block with mode 11 endpoints and a downsampled weight grid (grid_x by grid_y weights for a
+// block_x by block_y texel block).
+//
+// Up to NUM_PASSES refinement passes are made. Each pass:
+//  1. Finds endpoints/weights at full block resolution with try_mode11() using 32 weight levels.
+//  2. Downsamples those weights to the grid at ise_weight_range, then dequantizes and upsamples them back to block
+//     resolution so the error can be measured with the weights the grid actually reproduces.
+//  3. Refits the endpoints against the upsampled weights with (optionally emphasis-weighted) least squares, repacks
+//     them, and measures the error again. The refit endpoints also seed the next pass.
+//
+// Parameters:
+//  block_x/block_y, grid_x/grid_y: block and weight grid dimensions (validated by the asserts below).
+//  ise_weight_range/ise_endpoint_range: ISE ranges used for the output weights and endpoints.
+//  num_pixels, pBlock_pixels_half, pBlock_pixels_q16: the block's pixels in half float and q16 form.
+//  cur_block_error: error of the caller's current best solution; only lower-error trials are accepted.
+//  first_submode/last_submode/ignore_clamping: forwarded to try_mode11(); the submode range is also forwarded to pack_mode11().
+//  opt_mode: selects weighted (cWeightedLeastSquares/cWeightedLeastSquaresHeavy) or plain least squares refitting.
+//  pBlk_endpoints: receives NUM_MODE11_ENDPOINTS bytes when a trial is accepted.
+//  pBlk_weights: receives grid_x*grid_y ISE weights (grid resolution) when a trial is accepted.
+//  best_submode: reset to 0 on entry, then set to the accepted trial's submode.
+//  pBlock_stats: optional precomputed mean/axis; when null they are computed here.
+//
+// Returns the lowest block error found, or the cur_block_error passed in if nothing better was found.
+double encode_astc_hdr_block_downsampled_mode_11(
+	uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
+	uint32_t ise_weight_range, uint32_t ise_endpoint_range,
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	double cur_block_error,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+	uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
+	const astc_hdr_codec_base_options& coptions,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((block_x >= 4) && (block_y >= 4) && (block_x <= MAX_ASTC_HDR_BLOCK_W) && (block_y <= MAX_ASTC_HDR_BLOCK_H));
+	assert((grid_x >= 2) && (grid_y >= 2) && (grid_x <= block_x) && (grid_y <= block_y));
+
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+
+	best_submode = 0;
+
+	assert(astc_helpers::get_ise_levels(ise_weight_range) <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	// Number of weights actually stored in the block (grid resolution).
+	const uint32_t num_weights = grid_x * grid_y;
+
+	// Block mean and principal axis, either supplied by the caller or computed here.
+	vec3F block_mean_color_q16, block_axis_q16;
+	if (!pBlock_stats)
+	{
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+		block_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16);
+	}
+	else
+	{
+		assert(num_pixels == pBlock_stats->m_num_pixels);
+		block_mean_color_q16 = pBlock_stats->m_mean_q16;
+		block_axis_q16 = pBlock_stats->m_axis_q16;
+	}
+
+	aabb3F color_box_q16(cInitExpand);
+
+	// Project every pixel onto the axis: l/h are the extreme projections, and the pixels producing them
+	// become the initial low/high endpoint colors.
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	vec3F low_color_q16, high_color_q16;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = pBlock_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = pBlock_pixels_q16[i];
+		}
+	}
+
+	// Pull both endpoints 1/64th of the way towards each other.
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+
+	const uint32_t NUM_PASSES = 3;
+	for (uint32_t pass = 0; pass < NUM_PASSES; pass++)
+	{
+		uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+		uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // at block resolution, not grid res
+		uint32_t trial_best_submode = 0;
+
+		clear_obj(trial_blk_endpoints);
+		clear_obj(trial_blk_weights);
+
+		double trial_blk_error = BIG_FLOAT_VAL;
+
+		// Full-resolution solve at 32 weight levels. trial_blk_error starts at BIG_FLOAT_VAL, so this is
+		// independent of cur_block_error.
+		bool could_pack = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+			low_color_q16, high_color_q16,
+			pBlock_pixels_half, 32, astc_helpers::BISE_32_LEVELS, coptions, false, ise_endpoint_range, false,
+			first_submode, last_submode, ignore_clamping);
+
+		if (!could_pack)
+			break;
+
+		// Block-resolution 32-level weights -> grid-resolution weights at the requested ISE range.
+		uint8_t trial_downsampled_ise_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+		downsample_ise_weights(
+			astc_helpers::BISE_32_LEVELS, ise_weight_range,
+			block_x, block_y, grid_x, grid_y,
+			trial_blk_weights, trial_downsampled_ise_weights);
+
+		uint8_t trial_downsampled_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+		dequantize_astc_weights(num_weights, trial_downsampled_ise_weights, ise_weight_range, trial_downsampled_raw_weights);
+
+		// Upsample back to block resolution so the error reflects the per-texel weights the grid produces.
+		uint8_t trial_upsampled_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+		astc_helpers::upsample_weight_grid(block_x, block_y, grid_x, grid_y, trial_downsampled_raw_weights, trial_upsampled_raw_weights);
+
+		//------
+
+		int trial_e[2][3];
+		if (!decode_mode11_to_qlog12(trial_blk_endpoints, trial_e, ise_endpoint_range))
+			return cur_block_error;
+
+		double trial_error = compute_block_error_from_raw_weights(num_pixels, pBlock_pixels_half, trial_upsampled_raw_weights, trial_e, coptions);
+
+		if (trial_error < cur_block_error)
+		{
+			cur_block_error = trial_error;
+			memcpy(pBlk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(pBlk_weights, trial_downsampled_ise_weights, num_weights);
+			best_submode = trial_best_submode;
+		}
+		else if (pass)
+			break; // later passes stop as soon as they fail to improve; the first pass always goes on to the refit
+
+		// Refit the endpoints against the upsampled weights.
+		if ((opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy))
+		{
+			// Per-pixel emphasis derived from where each pixel projects between l and h.
+			float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			if (h == l)
+			{
+				for (uint32_t i = 0; i < num_pixels; i++)
+					emphasis_weights[i] = 1.0f;
+			}
+			else
+			{
+				// Normalized position of the mean (projection 0) between l and h.
+				float mid = (0.0f - l) / (h - l);
+				mid = clamp(mid, .01f, .99f);
+
+				float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT;
+				if (opt_mode == cWeightedLeastSquaresHeavy)
+					lw = LOW_EMPHASIS_WEIGHT_HEAVY, mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY, hw = HIGH_EMPHASIS_WEIGHT_HEAVY;
+
+				for (uint32_t i = 0; i < num_pixels; i++)
+				{
+					vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+					float kd = k.dot(block_axis_q16);
+
+					assert((kd >= l) && (kd <= h));
+
+					float v = (kd - l) / (h - l);
+
+					if (v < mid)
+						v = lerp(lw, mw, v / mid);
+					else
+					{
+						// NOTE(review): this multiplies by (1 - mid) where the lower segment divides by mid, so the
+						// lerp factor never reaches 1. Possibly intended to be a division; left as-is because
+						// changing it would alter encoder output - confirm with upstream.
+						v = lerp(mw, hw, (v - mid) * (1.0f - mid));
+					}
+
+					emphasis_weights[i] = v;
+				}
+			}
+
+			// Raw weights scaled by 1/64 for the solver.
+			float trial_upsampled_raw_weightsf[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			for (uint32_t i = 0; i < num_pixels; i++)
+				trial_upsampled_raw_weightsf[i] = (float)trial_upsampled_raw_weights[i] * (1.0f / 64.0f);
+
+			// If the solve fails, stop refining and return the best error so far. This previously did
+			// `return false;`, which converts to 0.0 and would be reported to the caller as a perfect encoding.
+			if (!compute_weighted_least_squares_endpoints_rgb(num_pixels, nullptr, nullptr, trial_upsampled_raw_weightsf, emphasis_weights, &low_color_q16, &high_color_q16, pBlock_pixels_q16, color_box_q16))
+				break;
+		}
+		else
+		{
+			if (!compute_least_squares_endpoints_rgb_raw_weights(num_pixels, trial_upsampled_raw_weights, &low_color_q16, &high_color_q16, pBlock_pixels_q16, color_box_q16))
+				break;
+		}
+
+		// Quantize the refit endpoints and re-measure with the same upsampled weights.
+		bool pack_succeeded = pack_mode11(low_color_q16, high_color_q16, ise_endpoint_range, trial_blk_endpoints, coptions, false, first_submode, last_submode, false, trial_best_submode);
+		if (!pack_succeeded)
+			break;
+
+		if (!decode_mode11_to_qlog12(trial_blk_endpoints, trial_e, ise_endpoint_range))
+			break;
+
+		trial_error = compute_block_error_from_raw_weights(num_pixels, pBlock_pixels_half, trial_upsampled_raw_weights, trial_e, coptions);
+
+		if (trial_error < cur_block_error)
+		{
+			cur_block_error = trial_error;
+			memcpy(pBlk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(pBlk_weights, trial_downsampled_ise_weights, num_weights);
+			best_submode = trial_best_submode;
+		}
+		else
+		{
+			break;
+		}
+
+	} // pass
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block with mode 11 endpoints and two weight planes.
+//
+// channel_index (0-2) selects the channel that gets its own weight plane (blk_weights1); the other two
+// channels share blk_weights0. An initial solution is found from a principal axis computed with the
+// channel_index component zeroed out, then up to NUM_LS_PASSES least squares refinements are tried,
+// solving each plane separately.
+//
+// blk_endpoints receives NUM_MODE11_ENDPOINTS bytes and blk_weights0/blk_weights1 receive num_pixels weights
+// each when the initial trial's error is lower than cur_block_error; the refinement passes hand these same
+// buffers to try_mode11_dual_plane(). best_submode is reset to 0 on entry.
+// uber_mode is accepted but unused.
+//
+// Returns the lowest block error found, or the cur_block_error passed in if nothing better was found.
+double encode_astc_hdr_block_mode_11_dual_plane(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t channel_index,		// 0-2
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only,
+	uint32_t ise_endpoint_range,
+	bool uber_mode,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping)
+{
+	(void)uber_mode;
+
+	assert(channel_index <= 2);
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+	
+	assert(num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS);
+
+	best_submode = 0;
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	// Copy of the block with the second-plane channel zeroed, so the mean/axis below only reflect the
+	// two channels that share the first weight plane.
+	vec4F temp_block_pixels_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		temp_block_pixels_q16[i] = pBlock_pixels_q16[i];
+		temp_block_pixels_q16[i][channel_index] = 0.0f;
+	}
+
+	vec3F block_mean_color_q16(calc_mean(num_pixels, temp_block_pixels_q16));
+	vec3F block_axis_q16(calc_rgb_pca(num_pixels, temp_block_pixels_q16, block_mean_color_q16));
+
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	vec3F low_color_q16, high_color_q16;
+
+	aabb3F color_box_q16(cInitExpand);
+
+	// The bounding box covers the full (unmodified) pixels; the projections use the zeroed copies.
+	// The pixels with the extreme projections become the initial low/high colors.
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(temp_block_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = pBlock_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = pBlock_pixels_q16[i];
+		}
+	}
+
+	// Drop the second-plane channel from both endpoints before comparing their component sums.
+	low_color_q16[channel_index] = 0.0f;
+	high_color_q16[channel_index] = 0.0f;
+
+	// Give the second-plane channel the bounding box extremes, assigning the box minimum to whichever
+	// endpoint has the smaller sum of the remaining two channels.
+	float a = low_color_q16.dot(vec3F(1.0f)), b = high_color_q16.dot(vec3F(1.0f));
+	if (a <= b)
+	{
+		low_color_q16[channel_index] = color_box_q16.get_low()[channel_index];
+		high_color_q16[channel_index] = color_box_q16.get_high()[channel_index];
+	}
+	else
+	{
+		high_color_q16[channel_index] = color_box_q16.get_low()[channel_index];
+		low_color_q16[channel_index] = color_box_q16.get_high()[channel_index];
+	}
+
+	// Pull both endpoints 1/64th of the way towards each other.
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+
+	uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+	uint8_t trial_blk_weights0[MAX_ASTC_HDR_ENC_BLOCK_PIXELS], trial_blk_weights1[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights0);
+	clear_obj(trial_blk_weights1);
+
+	// Starts at BIG_FLOAT_VAL so the initial trial is evaluated independently of cur_block_error.
+	double trial_blk_error = BIG_FLOAT_VAL;
+
+	bool did_improve = try_mode11_dual_plane(channel_index, num_pixels, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_best_submode,
+		low_color_q16, high_color_q16, 
+		pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+		first_submode, last_submode, ignore_clamping);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+		return cur_block_error;
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(blk_weights0, trial_blk_weights0, num_pixels);
+		memcpy(blk_weights1, trial_blk_weights1, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+	// The two channels that share the first weight plane.
+	const uint32_t chan0 = (channel_index + 1) % 3, chan1 = (channel_index + 2) % 3;
+
+	// Split the pixels and the bounding box into a 2D set (chan0, chan1) for plane 0 and a 1D set
+	// (channel_index) for plane 1, so each plane can be solved separately.
+	vec2F plane0_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	aabb2F plane0_bounds;
+	plane0_bounds[0].set(color_box_q16.get_low()[chan0], color_box_q16.get_low()[chan1]);
+	plane0_bounds[1].set(color_box_q16.get_high()[chan0], color_box_q16.get_high()[chan1]);
+
+	vec1F plane1_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	aabb1F plane1_bounds;
+	plane1_bounds[0].set(color_box_q16.get_low()[channel_index]);
+	plane1_bounds[1].set(color_box_q16.get_high()[channel_index]);
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		plane0_q16[i][0] = pBlock_pixels_q16[i][chan0];
+		plane0_q16[i][1] = pBlock_pixels_q16[i][chan1];
+
+		plane1_q16[i][0] = pBlock_pixels_q16[i][channel_index];
+	}
+
+	const uint32_t NUM_LS_PASSES = 3;
+
+	for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
+	{
+		// Least squares endpoints for each plane from the current trial weights; stop if either solve fails.
+		vec2F l0_q16, h0_q16;
+		if (!compute_least_squares_endpoints_2D(num_pixels, trial_blk_weights0, &g_astc_ls_weights_ise[ise_weight_range][0], &l0_q16, &h0_q16, plane0_q16, plane0_bounds))
+			break;
+
+		vec1F l1_q16, h1_q16;
+		if (!compute_least_squares_endpoints_1D(num_pixels, trial_blk_weights1, &g_astc_ls_weights_ise[ise_weight_range][0], &l1_q16, &h1_q16, plane1_q16, plane1_bounds))
+			break;
+
+		// Reassemble the per-plane results into full RGB endpoints.
+		vec3F l_q16, h_q16;
+
+		l_q16[channel_index] = l1_q16[0];
+		h_q16[channel_index] = h1_q16[0];
+
+		l_q16[chan0] = l0_q16[0];
+		h_q16[chan0] = h0_q16[0];
+
+		l_q16[chan1] = l0_q16[1];
+		h_q16[chan1] = h0_q16[1];
+
+		// This call is given the caller's output buffers and cur_block_error directly (not the trial copies).
+		bool was_improved = try_mode11_dual_plane(channel_index, num_pixels, blk_endpoints, blk_weights0, blk_weights1, cur_block_error, best_submode,
+			l_q16, h_q16,
+			pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+			first_submode, last_submode, ignore_clamping);
+
+		if (!was_improved)
+			break;
+
+		// It's improved, so let's take the new weight indices.
+		memcpy(trial_blk_weights0, blk_weights0, num_pixels);
+		memcpy(trial_blk_weights1, blk_weights1, num_pixels);
+
+	} // pass
+	
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block with mode 7 endpoints, which this function drives with a high color plus a
+// scalar scale "S" (see the comments on s_q16 below).
+//
+// Steps:
+//  1. Initial trial: high color and S derived from the extreme projections of the pixels onto the fixed
+//     grayscale axis (0.577350259, 0.577350259, 0.577350259), interpolated via interp_color().
+//  2. Alternate trial: uses the actual pixels with the extreme projections instead.
+//  3. Up to NUM_TRIALS refinement rounds which alternately re-estimate the high color (holding the weights
+//     and S fixed) and S (holding the weights and high color fixed).
+//
+// blk_endpoints receives NUM_MODE7_ENDPOINTS bytes and blk_weights receives num_pixels weights when the
+// initial trial's error is lower than cur_block_error; the later try_mode7() calls are handed these same
+// buffers. best_submode is reset to 0 on entry. pBlock_stats is optional; when null the block mean is
+// computed here.
+//
+// Returns the lowest block error found, or the cur_block_error passed in if nothing better was found.
+double encode_astc_hdr_block_mode_7(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints,  //[4]
+	uint8_t* blk_weights, // [num_pixels]
+	const astc_hdr_codec_base_options& coptions,
+	uint32_t ise_endpoint_range, 
+	int first_submode, int last_submode,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	best_submode = 0;
+
+	vec3F block_mean_color_q16;
+	if (!pBlock_stats)
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+	else
+	{
+		assert(num_pixels == pBlock_stats->m_num_pixels);
+		block_mean_color_q16 = pBlock_stats->m_mean_q16;
+	}
+
+	// Fixed grayscale axis; no PCA is computed for this mode.
+	vec3F block_axis_q16(0.577350259f);
+
+	aabb3F color_box_q16(cInitExpand);
+
+	// Extreme projections of the mean-relative pixels onto the axis, plus the block's bounding box.
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		l = basisu::minimum<float>(l, kd);
+		h = basisu::maximum<float>(h, kd);
+	}
+
+	vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16));
+	vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16));
+
+	low_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+	high_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+
+	vec3F diff(high_color_q16 - low_color_q16);
+
+	// The mul here (* block_axis_q16[0]) is because the "S" or scale value is subtracted from the high color with a scale of 1.0, 
+	// i.e. it's equivalent to a vector of (1,1,1) multiplied by scale before the sub. We want to actually move along the grayscale axis, or (0.577350259, 0.577350259, 0.577350259).
+	float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0];
+
+	uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS];
+	uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights);
+
+	// Starts at BIG_FLOAT_VAL so the initial trial is evaluated independently of cur_block_error.
+	double trial_blk_error = BIG_FLOAT_VAL;
+
+	bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+		high_color_q16, ceilf(s_q16),
+		pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+	{
+		return cur_block_error;
+	}
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(blk_weights, trial_blk_weights, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+#if 1
+	{
+		//const float TL = 8830.0f;// (float)half_to_qlog16(float_to_half(0.00061f));
+		//const float TH = 41600.0f;// (float)half_to_qlog16(float_to_half(40.0f));
+		//float zl = minimum<float>(color_box_q16[0][0], color_box_q16[0][1], color_box_q16[0][2]);
+		//float zh = minimum<float>(color_box_q16[1][0], color_box_q16[1][1], color_box_q16[1][2]);
+
+		//if ((zl <= TL) && (zh >= TH))
+		{
+			// Try a simpler technique for artifact reduction
+			// (uses the actual pixels with the lowest/highest projection rather than interpolated colors).
+			l = BIG_FLOAT_VAL;
+			h = -BIG_FLOAT_VAL;
+
+			vec3F alt_low_color_q16(0.0f), alt_high_color_q16(0.0f);
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				// The box was already expanded with these same pixels in the first loop above.
+				color_box_q16.expand(pBlock_pixels_q16[i]);
+
+				vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+				float kd = k.dot(block_axis_q16);
+
+				if (kd < l)
+				{
+					alt_low_color_q16 = pBlock_pixels_q16[i];
+					l = kd;
+				}
+
+				if (kd > h)
+				{
+					alt_high_color_q16 = pBlock_pixels_q16[i];
+					h = kd;
+				}
+			}
+
+			vec3F old_alt_low_color_q16(alt_low_color_q16);
+
+			// Only the low color is pulled 1/64th towards the high color; the high color is used as-is.
+			for (uint32_t i = 0; i < 3; i++)
+				alt_low_color_q16[i] = lerp<float>(old_alt_low_color_q16[i], alt_high_color_q16[i], 1.0f / 64.0f);
+
+			vec3F alt_diff(alt_high_color_q16 - alt_low_color_q16);
+
+			// The mul here (* block_axis_q16[0]) is because the "S" or scale value is subtracted from the high color with a scale of 1.0, 
+			// i.e. it's equivalent to a vector of (1,1,1) multiplied by scale before the sub. We want to actually move along the grayscale axis, or (0.577350259, 0.577350259, 0.577350259).
+			float alt_s_q16 = alt_diff.dot(block_axis_q16) * block_axis_q16[0];
+
+			// Return value ignored: this call is handed the caller's buffers and cur_block_error directly.
+			try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				alt_high_color_q16, ceilf(alt_s_q16),
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+		}
+	}
+#endif
+
+	const float one_over_num_pixels = 1.0f / (float)num_pixels;
+
+	const uint32_t NUM_TRIALS = 2;
+	for (uint32_t trial = 0; trial < NUM_TRIALS; trial++)
+	{
+		// Given a set of selectors and S, try to compute a better high color
+		vec3F new_high_color_q16(block_mean_color_q16);
+
+		int e[2][3];
+		int cur_s = 0;
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range))
+			break;
+
+		// Scale the decoded S by 16 (the decode helper's name indicates qlog12 output; the values used here are q16).
+		cur_s <<= 4;
+
+		// Start from the block mean and add the average of S * (1 - weight) over all pixels to each channel.
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			// Table entries are read at index selector + 1 and scaled by 1/64.
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
+			new_high_color_q16[0] += k;
+			new_high_color_q16[1] += k;
+			new_high_color_q16[2] += k;
+		}
+
+		bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+			new_high_color_q16, (float)cur_s,
+			pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+
+		// Keep the trial copies in sync with the caller's buffers so the S estimate below uses the new solution.
+		if (improved)
+		{
+			memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+		}
+
+		// Given a set of selectors and a high color, try to compute a better S.
+		float t = 0.0f;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			t += (1.0f) - lerp;
+		}
+
+		// t is now the mean of (1 - weight) over the block.
+		t *= one_over_num_pixels;
+
+		//int e[2][3];
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range))
+			break;
+
+		// Decoded high endpoint (e[1]) scaled by 16.
+		vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4));
+
+		// Skip the S estimates when t is too close to zero to divide by.
+		if (fabs(t) > .0000125f)
+		{
+			// Per-channel S estimates: (high - mean) / t.
+			float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
+			float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
+			float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
+
+			// TODO: gather statistics on these
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf(s_r),
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+			{
+				improved = true;
+			}
+
+			// Optional extra candidates: the green and blue estimates, their average with red, and 1.1x the largest (capped at 65535).
+			if (coptions.m_mode7_full_s_optimization)
+			{
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf(s_g),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf(s_b),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				// Added this - quite strong.
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, minimum(maximum(s_r, s_g, s_b) * 1.1f, 65535.0f),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+			} // if (coptions.m_mode7_full_s_optimization)
+
+		} // if (fabs(t) > .0000125f)
+
+		// Stop when neither the high color nor any S candidate improved this round.
+		if (!improved)
+			break;
+
+		// Seed the next round from the caller's (best) solution.
+		memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+	} // trial
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Converts n ISE-encoded weights from pSrc_ise_vals into raw weight values written to pDst_raw_weights,
+// using the dequantization table for the weight range from_ise_range.
+void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
+{
+	// ISE value -> raw weight lookup for this range.
+	const auto& ise_to_raw = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;
+
+	for (uint32_t idx = 0; idx != n; ++idx)
+	{
+		const uint8_t ise_val = pSrc_ise_vals[idx];
+		pDst_raw_weights[idx] = ise_to_raw[ise_val];
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// For each output (2x2) sample, the weight of each input (6x6) sample.
+// Indexed as [output sample][input sample]: 4 output grid samples by 36 input block texels.
+// Judging by the nonzero pattern, both indices are in row-major order (index = y * width + x): entry 0 weights
+// the top-left texels, entry 1 the top-right, entry 2 the bottom-left and entry 3 the bottom-right.
+// The weights in each entry appear to be normalized to sum to approximately 1.
+static const float g_weight_downsample_6x6_to_2x2[4][36] = {
+{0.165438f, 0.132609f, 0.092681f, 0.028953f, 0.000000f, 0.000000f, 0.133716f, 0.111240f, 0.065133f, 0.022236f, 0.000000f, 0.000000f, 0.092623f, 0.063898f, 0.039120f, 0.000000f, 0.000000f, 0.000000f, 0.028168f, 0.024184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.027262f, 0.091051f, 0.132446f, 0.164791f, 0.000000f, 0.000000f, 0.026038f, 0.066511f, 0.111644f, 0.133197f, 0.000000f, 0.000000f, 0.000000f, 0.040053f, 0.064757f, 0.091196f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024265f, 0.026789f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028282f, 0.024804f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092871f, 0.066580f, 0.042024f, 0.000000f, 0.000000f, 0.000000f, 0.132115f, 0.107586f, 0.061943f, 0.025551f, 0.000000f, 0.000000f, 0.166111f, 0.132946f, 0.089043f, 0.030145f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024535f, 0.028835f, 0.000000f, 0.000000f, 0.000000f, 0.044465f, 0.063652f, 0.093251f, 0.000000f, 0.000000f, 0.025961f, 0.063339f, 0.107329f, 0.132240f, 0.000000f, 0.000000f, 0.029844f, 0.089249f, 0.132200f, 0.165099f},
+};
+
+// For each output (3x2) sample, the weight of each input (6x6) sample.
+// Indexed as [output sample][input sample]: 6 output grid samples (3 wide, 2 high) by 36 input block texels.
+// Judging by the nonzero pattern, both indices are in row-major order (index = y * width + x): entries 0-2 weight
+// the upper texel rows from left to right, entries 3-5 the lower texel rows.
+static const float g_weight_downsample_6x6_to_3x2[6][36] = {
+{0.257933f, 0.144768f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.213754f, 0.109376f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.140969f, 0.064128f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041270f, 0.027803f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.046066f, 0.153691f, 0.153395f, 0.042845f, 0.000000f, 0.000000f, 0.038497f, 0.131674f, 0.126804f, 0.041513f, 0.000000f, 0.000000f, 0.028434f, 0.081152f, 0.075499f, 0.025372f, 0.000000f, 0.000000f, 0.000000f, 0.030067f, 0.024989f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.147088f, 0.258980f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105549f, 0.211746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066714f, 0.144015f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027755f, 0.038152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044268f, 0.030990f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.141642f, 0.069930f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.207393f, 0.105354f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.255911f, 0.144511f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026658f, 0.032535f, 0.000000f, 0.000000f, 0.000000f, 0.024618f, 0.079487f, 0.080415f, 0.026311f, 0.000000f, 0.000000f, 0.038382f, 0.133569f, 0.133162f, 0.033451f, 0.000000f, 0.000000f, 0.043697f, 0.152483f, 0.154345f, 0.040885f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026401f, 0.040228f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066688f, 0.142350f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.108504f, 0.210286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.149666f, 0.255876f},
+};
+
+// For each output (4x2) sample, the weight of each input (6x6) sample.
+// Indexed as [output sample][input sample]: 8 output grid samples (4 wide, 2 high) by 36 input block texels.
+// Judging by the nonzero pattern, both indices are in row-major order (index = y * width + x): entries 0-3 weight
+// the upper texel rows from left to right, entries 4-7 the lower texel rows.
+static const float g_weight_downsample_6x6_to_4x2[8][36] = {
+{0.318857f, 0.081413f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.262816f, 0.064811f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.175211f, 0.046152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.050740f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.163830f, 0.223661f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128904f, 0.194332f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080369f, 0.121162f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041941f, 0.045801f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.230801f, 0.166220f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193495f, 0.136548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113816f, 0.085890f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043771f, 0.029459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.087528f, 0.318213f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059739f, 0.262039f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046515f, 0.175973f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049993f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.054078f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.173243f, 0.055145f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.254561f, 0.059695f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.319463f, 0.083816f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.038171f, 0.037447f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.076263f, 0.117360f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134218f, 0.202503f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.163759f, 0.230278f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044607f, 0.035170f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.114466f, 0.088407f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201026f, 0.127983f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.224148f, 0.164194f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052817f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043531f, 0.174390f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.060164f, 0.262636f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.089340f, 0.317122f},
+};
+
+// For each output (5x2) sample, the weight of each input (6x6) sample.
+// Indexed as [output sample][input sample]: 10 output grid samples (5 wide, 2 high) by 36 input block texels.
+// Judging by the nonzero pattern, both indices are in row-major order (index = y * width + x): entries 0-4 weight
+// the upper texel rows from left to right, entries 5-9 the lower texel rows.
+static const float g_weight_downsample_6x6_to_5x2[10][36] = {
+{0.393855f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.327491f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.216089f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062565f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.303101f, 0.078223f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.261199f, 0.068761f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.160056f, 0.054634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.074026f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.202529f, 0.207447f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.151013f, 0.157673f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.100074f, 0.095239f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043623f, 0.042402f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.083336f, 0.309647f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061432f, 0.269582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046328f, 0.166035f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063640f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397684f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326178f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.217856f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.058282f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065541f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215996f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321124f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397338f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.069030f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.159434f, 0.051902f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.266327f, 0.065732f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.305627f, 0.081948f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.038550f, 0.046259f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092606f, 0.100038f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.162523f, 0.163345f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199767f, 0.196912f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066709f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.050841f, 0.169003f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061591f, 0.265094f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.081426f, 0.305335f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063517f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.210896f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316133f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027674f, 0.381781f},
+};
+
+// For each output (6x2) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_6x2[12][36] = {
+{0.395563f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.328397f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214936f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061104f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.395041f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.323513f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.073360f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.393200f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.317339f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.218679f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.070782f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.399071f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321356f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214689f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.064883f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.399159f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326009f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212426f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062406f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398973f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.217446f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.057071f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065386f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215039f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321113f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398462f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072234f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.211515f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.319185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397066f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.053184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.213286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.332634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400895f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063501f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.207210f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.334096f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395193f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.074315f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.216723f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.320827f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388135f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063571f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215814f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.325843f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394772f},
+};
+
+// For each output (2x3) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_2x3[6][36] = {
+{0.253933f, 0.211745f, 0.142964f, 0.043509f, 0.000000f, 0.000000f, 0.146094f, 0.108119f, 0.068727f, 0.024908f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.043336f, 0.140540f, 0.208745f, 0.253069f, 0.000000f, 0.000000f, 0.031333f, 0.069242f, 0.108596f, 0.145138f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044780f, 0.036916f, 0.026808f, 0.000000f, 0.000000f, 0.000000f, 0.151455f, 0.129189f, 0.076266f, 0.030885f, 0.000000f, 0.000000f, 0.151915f, 0.131628f, 0.081598f, 0.031903f, 0.000000f, 0.000000f, 0.043838f, 0.032645f, 0.030173f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028998f, 0.038454f, 0.046460f, 0.000000f, 0.000000f, 0.033717f, 0.076274f, 0.130140f, 0.153377f, 0.000000f, 0.000000f, 0.025762f, 0.077843f, 0.130195f, 0.150217f, 0.000000f, 0.000000f, 0.000000f, 0.029422f, 0.034493f, 0.044648f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.145243f, 0.107655f, 0.062280f, 0.033041f, 0.000000f, 0.000000f, 0.257369f, 0.210260f, 0.139667f, 0.044485f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.037604f, 0.064104f, 0.105759f, 0.144848f, 0.000000f, 0.000000f, 0.042699f, 0.141511f, 0.207704f, 0.255772f},
+};
+
+// For each output (3x3) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_3x3[9][36] = {
+{0.412913f, 0.237773f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.237370f, 0.111944f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.066531f, 0.251421f, 0.245639f, 0.065785f, 0.000000f, 0.000000f, 0.047059f, 0.143642f, 0.128760f, 0.051164f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.234587f, 0.419421f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.110765f, 0.235227f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.067391f, 0.044131f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.248992f, 0.133218f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.247568f, 0.139987f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072238f, 0.046475f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.040674f, 0.048555f, 0.000000f, 0.000000f, 0.000000f, 0.049640f, 0.158199f, 0.158521f, 0.046044f, 0.000000f, 0.000000f, 0.043591f, 0.153956f, 0.155258f, 0.049378f, 0.000000f, 0.000000f, 0.000000f, 0.046674f, 0.049509f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049528f, 0.063611f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.137662f, 0.252612f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134924f, 0.246668f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.042655f, 0.072341f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.237403f, 0.114850f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.418506f, 0.229241f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049009f, 0.142093f, 0.136891f, 0.036294f, 0.000000f, 0.000000f, 0.074433f, 0.244437f, 0.251631f, 0.065212f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.121166f, 0.231108f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.236230f, 0.411495f},
+};
+
+// For each output (4x3) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_4x3[12][36] = {
+{0.508292f, 0.132529f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.285382f, 0.073798f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.266624f, 0.378457f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.144380f, 0.210539f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.380292f, 0.270590f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200825f, 0.148293f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.130560f, 0.507542f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.071578f, 0.290320f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094051f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322294f, 0.082665f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316365f, 0.092271f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092353f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046081f, 0.061377f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.158151f, 0.235006f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.152896f, 0.232594f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052844f, 0.061053f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061619f, 0.046867f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.227763f, 0.158202f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.222620f, 0.155545f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.073398f, 0.053986f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.084098f, 0.330283f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085224f, 0.323658f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094451f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.286413f, 0.077046f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.512915f, 0.123625f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.140389f, 0.213324f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.267125f, 0.379163f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208464f, 0.139969f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.382876f, 0.268691f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080416f, 0.285653f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.131803f, 0.502128f},
+};
+
+// For each output (5x3) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_5x3[15][36] = {
+{0.618662f, 0.032137f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.349200f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.497060f, 0.129255f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.281642f, 0.092043f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.333166f, 0.338337f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164333f, 0.164165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.129409f, 0.504176f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085525f, 0.280890f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.636943f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.363057f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113467f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394204f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.386741f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105588f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.086925f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.317750f, 0.095763f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321008f, 0.086368f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.057696f, 0.061462f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.184995f, 0.197656f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.186342f, 0.186715f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059712f, 0.065422f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.091939f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079906f, 0.328876f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085955f, 0.320229f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.093096f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.099585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398489f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388782f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113144f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360655f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.639345f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.285578f, 0.088663f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.495946f, 0.129812f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177513f, 0.166195f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.329950f, 0.326342f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082692f, 0.279744f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134353f, 0.503211f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361178f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.638822f},
+};
+
+// For each output (6x3) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_6x3[18][36] = {
+{0.640623f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359377f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.638697f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361303f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.640672f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359328f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.637721f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.362279f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.647342f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.352658f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.638418f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111041f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395972f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.387932f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105054f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.101949f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.401263f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.101060f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098132f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388180f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402030f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111659f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.096173f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393865f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.386312f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.123650f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.104357f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398062f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393265f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.104316f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097666f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400772f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.390396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111166f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359466f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640534f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360569f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.639431f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.355750f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.644250f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.353865f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.646135f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.642273f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359539f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640461f},
+};
+
+// For each output (2x4) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_2x4[8][36] = {
+{0.312206f, 0.261492f, 0.177496f, 0.055798f, 0.000000f, 0.000000f, 0.081944f, 0.062361f, 0.048703f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.054679f, 0.172805f, 0.260561f, 0.314742f, 0.000000f, 0.000000f, 0.000000f, 0.049040f, 0.065652f, 0.082520f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164115f, 0.129589f, 0.083879f, 0.029309f, 0.000000f, 0.000000f, 0.231202f, 0.198851f, 0.118719f, 0.044334f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.035855f, 0.083276f, 0.127764f, 0.166965f, 0.000000f, 0.000000f, 0.045347f, 0.116503f, 0.193645f, 0.230645f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.223790f, 0.194804f, 0.115855f, 0.047371f, 0.000000f, 0.000000f, 0.164616f, 0.125798f, 0.087268f, 0.040497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044738f, 0.118365f, 0.198854f, 0.230745f, 0.000000f, 0.000000f, 0.029646f, 0.078141f, 0.131405f, 0.168106f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080206f, 0.060505f, 0.041197f, 0.000000f, 0.000000f, 0.000000f, 0.320486f, 0.265233f, 0.174992f, 0.057380f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.051057f, 0.058139f, 0.082120f, 0.000000f, 0.000000f, 0.056168f, 0.174118f, 0.260525f, 0.317873f},
+};
+
+// For each output (3x4) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_3x4[12][36] = {
+{0.503381f, 0.288537f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.130806f, 0.077275f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.088808f, 0.319226f, 0.312498f, 0.086797f, 0.000000f, 0.000000f, 0.000000f, 0.092065f, 0.079421f, 0.021185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.286250f, 0.514036f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072999f, 0.126714f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.261935f, 0.133191f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.376226f, 0.207118f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021529f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059585f, 0.153016f, 0.152552f, 0.043373f, 0.000000f, 0.000000f, 0.063990f, 0.231504f, 0.235283f, 0.060696f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.146403f, 0.262394f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208547f, 0.382656f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.374676f, 0.209306f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.270440f, 0.145577f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059636f, 0.233975f, 0.235944f, 0.069029f, 0.000000f, 0.000000f, 0.048950f, 0.150198f, 0.154340f, 0.047929f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200921f, 0.380881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.146928f, 0.271271f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128883f, 0.075468f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.509859f, 0.285791f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.095842f, 0.086878f, 0.000000f, 0.000000f, 0.000000f, 0.092942f, 0.314169f, 0.319263f, 0.090906f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079652f, 0.124852f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.289868f, 0.505628f},
+};
+
+// For each output (4x4) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_4x4[16][36] = {
+{0.665277f, 0.167914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166809f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.325854f, 0.449938f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094690f, 0.129518f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.455174f, 0.326025f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.109174f, 0.109627f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166733f, 0.664155f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.169112f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.320619f, 0.090788f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.462066f, 0.126527f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.165890f, 0.235855f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.233931f, 0.364324f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.239319f, 0.151533f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.363629f, 0.245519f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.106763f, 0.311932f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.119451f, 0.461853f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.451893f, 0.124086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326160f, 0.097861f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.239712f, 0.365585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164178f, 0.230525f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360274f, 0.237862f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.246139f, 0.155726f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.121863f, 0.457051f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097828f, 0.323258f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.163634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.667648f, 0.168718f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094870f, 0.132660f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316878f, 0.455591f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.116917f, 0.098433f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.458816f, 0.325834f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.168403f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.172019f, 0.659578f},
+};
+
+// For each output (5x4) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_5x4[20][36] = {
+{0.773702f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192588f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.633422f, 0.166577f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.170080f, 0.029921f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.388335f, 0.403694f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.100996f, 0.106975f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.161122f, 0.655288f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.183590f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.801705f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198295f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400989f, 0.025097f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.573915f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309345f, 0.085396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.478694f, 0.126565f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.194664f, 0.187267f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.292735f, 0.308960f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.016375f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098049f, 0.295983f, 0.000000f, 0.000000f, 0.017892f, 0.000000f, 0.111938f, 0.476138f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043545f, 0.386448f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.570007f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.566407f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402307f, 0.031286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.463145f, 0.120696f, 0.000000f, 0.019497f, 0.000000f, 0.000000f, 0.311721f, 0.084942f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.296730f, 0.300781f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.204639f, 0.197849f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122117f, 0.469302f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.102545f, 0.306036f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.562064f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041534f, 0.396403f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190134f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.773971f, 0.035896f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.169927f, 0.035812f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.630284f, 0.163977f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.112667f, 0.106813f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393502f, 0.387018f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177024f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.170482f, 0.652494f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192274f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033039f, 0.774687f},
+};
+
+// For each output (6x4) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_6x4[24][36] = {
+{0.804254f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.804177f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195823f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.799585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200415f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.803604f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.807256f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192744f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.805135f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.194865f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.410532f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.589468f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408690f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.591310f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.416225f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.583775f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.414279f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.585721f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.406723f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.593277f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.597490f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.584784f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.415216f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.590427f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.409573f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.590073f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.409927f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.580348f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.419652f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.588321f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411679f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.587022f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.412978f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193281f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.806719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.189163f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.810837f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195108f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.804892f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.188290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.811710f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.807086f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195292f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.804708f},
+};
+
+// For each output (2x5) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_2x5[10][36] = {
+{0.387593f, 0.325123f, 0.221104f, 0.066180f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.065940f, 0.214659f, 0.326737f, 0.392664f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309603f, 0.265953f, 0.168780f, 0.060600f, 0.000000f, 0.000000f, 0.084707f, 0.063017f, 0.047341f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062836f, 0.170767f, 0.261053f, 0.307978f, 0.000000f, 0.000000f, 0.000000f, 0.049286f, 0.064361f, 0.083719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195787f, 0.153943f, 0.095706f, 0.042417f, 0.000000f, 0.000000f, 0.190695f, 0.154435f, 0.097288f, 0.040258f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029471f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017536f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.039307f, 0.094677f, 0.158696f, 0.199136f, 0.000000f, 0.000000f, 0.040959f, 0.093353f, 0.155294f, 0.201042f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079432f, 0.065739f, 0.044876f, 0.000000f, 0.000000f, 0.000000f, 0.309205f, 0.264700f, 0.167247f, 0.068801f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052112f, 0.064829f, 0.081363f, 0.000000f, 0.000000f, 0.064024f, 0.161136f, 0.263743f, 0.312793f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393277f, 0.324792f, 0.213188f, 0.068743f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066964f, 0.215440f, 0.323005f, 0.394591f},
+};
+
+// For each output (3x5) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_3x5[15][36] = {
+{0.620557f, 0.350797f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028646f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.110170f, 0.397489f, 0.386326f, 0.106015f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357348f, 0.642652f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.503934f, 0.275289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128280f, 0.092497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.102294f, 0.316223f, 0.313576f, 0.092518f, 0.000000f, 0.000000f, 0.000000f, 0.081158f, 0.094231f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.279079f, 0.502163f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.086083f, 0.132675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.325483f, 0.157739f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322567f, 0.172225f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021986f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063342f, 0.192228f, 0.186950f, 0.057021f, 0.000000f, 0.000000f, 0.054779f, 0.186114f, 0.185666f, 0.073901f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.172195f, 0.331802f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.148212f, 0.322038f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.025751f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.123726f, 0.081188f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.507339f, 0.287746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.093924f, 0.094021f, 0.000000f, 0.000000f, 0.000000f, 0.097070f, 0.315697f, 0.314560f, 0.084728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082560f, 0.129771f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.277014f, 0.486817f, 0.023837f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.644191f, 0.355809f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.107771f, 0.387615f, 0.393454f, 0.111159f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360886f, 0.639114f},
+};
+
+// For each output (4x5) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_4x5[20][36] = {
+{0.778254f, 0.190730f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.031016f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.401147f, 0.570243f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028610f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.563768f, 0.394241f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041992f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196238f, 0.767548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036214f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.637514f, 0.166734f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.167634f, 0.028118f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322778f, 0.473312f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085399f, 0.118511f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471429f, 0.308185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.118025f, 0.102361f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.176592f, 0.643933f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.179475f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.391609f, 0.100882f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.390531f, 0.116978f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017259f, 0.000000f, 0.201618f, 0.301555f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.197600f, 0.281968f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.016735f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.293309f, 0.192842f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.268674f, 0.208109f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.020330f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.118514f, 0.380746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097621f, 0.381305f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021814f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.157977f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.657533f, 0.184490f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097522f, 0.128585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309864f, 0.464029f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128900f, 0.090864f, 0.000000f, 0.025393f, 0.000000f, 0.000000f, 0.464029f, 0.290814f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024593f, 0.172268f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.173412f, 0.629727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.778816f, 0.191602f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036297f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394454f, 0.569249f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.039685f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.561207f, 0.399108f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.034683f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193744f, 0.771574f},
+};
+
+// For each output (5x5) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_5x5[25][36] = {
+{1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.794727f, 0.205273f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.465125f, 0.484079f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028881f, 0.000000f, 0.000000f, 0.021914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.192446f, 0.772941f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.034613f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033123f, 0.930510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036367f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800234f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199766f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.629079f, 0.165939f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166390f, 0.019675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.018918f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.378734f, 0.373861f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111597f, 0.135808f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177492f, 0.641195f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.181313f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028722f, 0.761781f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.209497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.475763f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471882f, 0.029551f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.022804f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.382714f, 0.116167f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.383377f, 0.117742f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.254151f, 0.249987f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.241972f, 0.253891f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.017950f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122722f, 0.376847f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.095099f, 0.369986f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017396f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029442f, 0.472507f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471751f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026300f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190299f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.776924f, 0.032778f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.171498f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.666385f, 0.162117f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.125713f, 0.117624f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.387084f, 0.369579f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028493f, 0.169318f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.173770f, 0.628419f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198951f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.035634f, 0.765415f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.963102f, 0.036898f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.030322f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.771054f, 0.198624f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021816f, 0.020944f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.481761f, 0.475479f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.032816f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198418f, 0.768766f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033338f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966662f},
+};
+
+// For each output (6x5) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_6x5[30][36] = {
+{0.966284f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033716f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.966290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033710f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966125f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033875f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966273f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800857f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199143f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.773463f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.025372f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.805735f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.194265f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.788791f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.211209f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.785975f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214025f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.787286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212714f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490845f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.487242f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021913f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490663f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486878f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.022459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.505452f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.494548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.495383f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.482180f, 0.000000f, 0.022437f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.022727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.496545f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.480728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486261f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486387f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027352f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196272f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.803728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.210059f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.789941f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212947f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.787053f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215261f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.784739f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.209116f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.790884f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.205881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.794119f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033710f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966281f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033712f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966288f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033712f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966288f},
+};
+
+// For each output (2x6) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_2x6[12][36] = {
+{0.388815f, 0.325435f, 0.220189f, 0.065562f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.064515f, 0.214042f, 0.327700f, 0.393742f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398821f, 0.326200f, 0.217851f, 0.057128f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062546f, 0.216408f, 0.322269f, 0.398777f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.396575f, 0.330631f, 0.212857f, 0.059936f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.070253f, 0.215326f, 0.317576f, 0.396845f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398130f, 0.324745f, 0.213572f, 0.063553f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062009f, 0.216253f, 0.324683f, 0.397055f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397646f, 0.321346f, 0.212334f, 0.068675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.067073f, 0.210768f, 0.318165f, 0.403993f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395756f, 0.325048f, 0.211862f, 0.067334f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065475f, 0.214113f, 0.324009f, 0.396403f},
+};
+
+// For each output (3x6) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_3x6[18][36] = {
+{0.640136f, 0.359864f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.108112f, 0.399968f, 0.388087f, 0.103833f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.356122f, 0.643878f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.646308f, 0.353692f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122937f, 0.390166f, 0.380558f, 0.106339f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.355015f, 0.644985f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.642874f, 0.357126f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111570f, 0.398638f, 0.387639f, 0.102153f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359134f, 0.640866f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640159f, 0.359841f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098908f, 0.393303f, 0.400421f, 0.107369f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357119f, 0.642881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640541f, 0.359459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.116318f, 0.397635f, 0.395084f, 0.090964f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361948f, 0.638052f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.645448f, 0.354552f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.106981f, 0.389214f, 0.395056f, 0.108749f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359592f, 0.640408f},
+};
+
+// For each output (4x6) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_4x6[24][36] = {
+{0.806928f, 0.193072f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.412216f, 0.587784f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.590075f, 0.409925f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200682f, 0.799318f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.809822f, 0.190178f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.423474f, 0.576526f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.580816f, 0.419184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190240f, 0.809760f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800320f, 0.199680f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408625f, 0.591375f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.583392f, 0.416608f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200372f, 0.799628f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.798914f, 0.201086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411243f, 0.588757f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.586520f, 0.413480f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.203588f, 0.796412f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.802040f, 0.197960f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411175f, 0.588825f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.599873f, 0.400127f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193060f, 0.806940f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.806073f, 0.193927f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408705f, 0.591295f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.585711f, 0.414289f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.197672f, 0.802328f},
+};
+
+// For each output (5x6) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_5x6[30][36] = {
+{0.966289f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.794848f, 0.205152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.473272f, 0.496525f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.030202f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.196955f, 0.803045f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033711f, 0.966289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966284f, 0.033716f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.795787f, 0.204213f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.500928f, 0.499072f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198603f, 0.801397f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033716f, 0.966284f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966283f, 0.033717f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.788424f, 0.211576f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029276f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.484227f, 0.486497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201499f, 0.798501f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033724f, 0.966276f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966283f, 0.033717f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.791336f, 0.208664f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490188f, 0.509812f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.204835f, 0.795165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033703f, 0.966297f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966276f, 0.033724f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.799276f, 0.200724f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.022501f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.494443f, 0.483055f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.205967f, 0.794033f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033726f, 0.966274f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.965971f, 0.034029f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.798640f, 0.201360f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.502577f, 0.497423f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.203927f, 0.796073f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033706f, 0.966294f},
+};
+
+// For each output (6x6) sample, the weight of each input (6x6) sample. This is the identity matrix: each output sample simply copies the matching input sample.
+static const float g_weight_downsample_6x6_to_6x6[36][36] = {
+{1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f},
+};
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// One lookup entry: a weight grid size paired with the flattened downsample matrix
+// used to resample a 6x6 block of weights to that grid size.
+struct downsample_matrix_6x6
+{
+	uint32_t m_grid_width, m_grid_height;
+	const float* m_p; // first float of the matrix (consumed row by row by downsample_weight_grid())
+};
+
+// All supported grid sizes (width 2-6 by height 2-6), ordered by grid height, then grid width.
+const downsample_matrix_6x6 g_downsample_matrices_6x6[] = {
+	{ 2, 2, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_2x2) },
+	{ 3, 2, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_3x2) },
+	{ 4, 2, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_4x2) },
+	{ 5, 2, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_5x2) },
+	{ 6, 2, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_6x2) },
+	{ 2, 3, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_2x3) },
+	{ 3, 3, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_3x3) },
+	{ 4, 3, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_4x3) },
+	{ 5, 3, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_5x3) },
+	{ 6, 3, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_6x3) },
+	{ 2, 4, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_2x4) },
+	{ 3, 4, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_3x4) },
+	{ 4, 4, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_4x4) },
+	{ 5, 4, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_5x4) },
+	{ 6, 4, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_6x4) },
+	{ 2, 5, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_2x5) },
+	{ 3, 5, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_3x5) },
+	{ 4, 5, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_4x5) },
+	{ 5, 5, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_5x5) },
+	{ 6, 5, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_6x5) },
+	{ 2, 6, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_2x6) },
+	{ 3, 6, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_3x6) },
+	{ 4, 6, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_4x6) },
+	{ 5, 6, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_5x6) },
+	{ 6, 6, reinterpret_cast<const float*>(g_weight_downsample_6x6_to_6x6) }
+};
+//const uint32_t NUM_DOWNSAMPLE_MATRICES_6x6 = sizeof(g_downsample_matrices_6x6) / sizeof(g_downsample_matrices_6x6[0]);
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Returns the flattened downsample matrix registered in g_downsample_matrices_6x6 for a
+// grid_width x grid_height weight grid. If no entry matches, asserts (debug builds) and
+// returns nullptr.
+const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height)
+{
+	// TODO: Use hash or map lookup.
+	const uint32_t num_entries = (uint32_t)(sizeof(g_downsample_matrices_6x6) / sizeof(g_downsample_matrices_6x6[0]));
+
+	for (uint32_t entry_index = 0; entry_index < num_entries; entry_index++)
+	{
+		const auto& entry = g_downsample_matrices_6x6[entry_index];
+
+		const bool dims_match = (entry.m_grid_width == grid_width) && (entry.m_grid_height == grid_height);
+		if (dims_match)
+			return entry.m_p;
+	}
+
+	// No table exists for the requested grid size.
+	assert(0);
+	return nullptr;
+}
+
+// Resamples a block of dequantized weights to a smaller weight grid using a precomputed matrix.
+// pMatrix_weights is read as one row of (bx * by) floats per output sample, with the rows in
+// [wy][wx] order. Each output is the weighted sum of the source weights, rounded and clamped
+// to [0, 64].
+void downsample_weight_grid(
+	const float* pMatrix_weights,
+	uint32_t bx, uint32_t by,		// source/from dimension (block size)
+	uint32_t wx, uint32_t wy,		// dest/to dimension (grid size)
+	const uint8_t* pSrc_weights,	// these are dequantized weights, NOT ISE symbols, [by][bx]
+	uint8_t* pDst_weights)			// [wy][wx]
+{
+	const uint32_t samples_per_row = bx * by;
+
+	// Matrix row belonging to the output sample currently being computed.
+	const float* pRow = pMatrix_weights;
+
+	for (uint32_t gy = 0; gy < wy; gy++)
+	{
+		for (uint32_t gx = 0; gx < wx; gx++, pRow += samples_per_row)
+		{
+			// Start at 0.5 so the float->int truncation below rounds a non-negative sum to nearest.
+			float accum = 0.5f;
+
+			for (uint32_t s = 0; s < samples_per_row; s++)
+			{
+				const float w = pRow[s];
+				if (w == 0.0f)
+					continue; // zero matrix entries contribute nothing
+
+				accum += w * (float)pSrc_weights[s];
+			}
+
+			pDst_weights[gx + gy * wx] = (uint8_t)clamp((int)accum, 0, 64);
+		}
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Converts a block-sized array of ISE weight symbols into a grid_w x grid_h weight grid,
+// optionally changing the weight ISE range along the way.
+//
+//  dequant_weight_ise_range  ISE range of the symbols in pSrc_weights.
+//  quant_weight_ise_range    ISE range of the symbols written to pDst_weights.
+//  block_w, block_h          dimensions of the source weights, laid out [block_h][block_w].
+//  grid_w, grid_h            dimensions of the destination grid, laid out [grid_h][grid_w]; each >= 2.
+//  pSrc_weights              block_w * block_h source ISE symbols.
+//  pDst_weights              receives the destination ISE symbols.
+//
+// When the grid has the same dimensions as the block no resampling happens: the symbols are
+// either requantized to the new range or copied. Otherwise the symbols are dequantized,
+// resampled with the matrix from get_6x6_downsample_matrix(), and requantized. That path is
+// only valid for 6x6 blocks.
+void downsample_ise_weights(
+	uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
+	uint32_t block_w, uint32_t block_h,
+	uint32_t grid_w, uint32_t grid_h,
+	const uint8_t* pSrc_weights, uint8_t* pDst_weights)
+{
+	assert((block_w <= MAX_ASTC_HDR_BLOCK_W) && (block_h <= MAX_ASTC_HDR_BLOCK_H));
+	assert((grid_w >= 2) && (grid_w <= MAX_ASTC_HDR_BLOCK_W));
+	assert((grid_h >= 2) && (grid_h <= MAX_ASTC_HDR_BLOCK_H));
+
+	assert(dequant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE);
+	assert(dequant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE);
+
+	assert(quant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE);
+	assert(quant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE);
+
+	if ((block_w == grid_w) && (block_h == grid_h))
+	{
+		// Grid matches the block: at most a change of ISE range is needed.
+		if (dequant_weight_ise_range != quant_weight_ise_range)
+		{
+			basist::astc_6x6_hdr::requantize_astc_weights(block_w * block_h, pSrc_weights, dequant_weight_ise_range, pDst_weights, quant_weight_ise_range);
+		}
+		else
+		{
+			// Same range and same size: plain copy, skipped when called in place.
+			if (pDst_weights != pSrc_weights)
+				memcpy(pDst_weights, pSrc_weights, block_w * block_h);
+		}
+
+		return;
+	}
+
+	// The matrices returned by get_6x6_downsample_matrix() are built for 6x6 blocks, and
+	// downsample_weight_grid() strides the matrix by block_w * block_h. Any other block size
+	// would silently read misaligned matrix rows, so catch it here.
+	assert((block_w == 6) && (block_h == 6));
+
+	// Step 1: ISE symbols -> dequantized weight values.
+	uint8_t desired_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(dequant_weight_ise_range).m_ISE_to_val;
+
+	for (uint32_t by = 0; by < block_h; by++)
+		for (uint32_t bx = 0; bx < block_w; bx++)
+			desired_weights[bx + by * block_w] = dequant_tab[pSrc_weights[bx + by * block_w]];
+
+	// Step 2: resample the dequantized weights down to the grid.
+	uint8_t downsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+	const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
+	assert(pDownsample_matrix);
+
+	downsample_weight_grid(
+		pDownsample_matrix,
+		block_w, block_h,		// source/from dimension (block size)
+		grid_w, grid_h,			// dest/to dimension (grid size)
+		desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
+		downsampled_weights);	// [wy][wx]
+
+	// Step 3: resampled weight values -> ISE symbols in the destination range.
+	const auto& weight_quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(quant_weight_ise_range).m_val_to_ise;
+
+	for (uint32_t gy = 0; gy < grid_h; gy++)
+		for (uint32_t gx = 0; gx < grid_w; gx++)
+			pDst_weights[gx + gy * grid_w] = weight_quant_tab[downsampled_weights[gx + gy * grid_w]];
+}
+
+// Dual-plane variant of downsample_ise_weights(): each plane's weights are processed
+// independently with the same parameters, then interleaved into pDst_weights
+// (plane 0 at even indices, plane 1 at odd indices), 2 * grid_w * grid_h symbols in total.
+void downsample_ise_weights_dual_plane(
+	uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
+	uint32_t block_w, uint32_t block_h,
+	uint32_t grid_w, uint32_t grid_h,
+	const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1,
+	uint8_t* pDst_weights)
+{
+	// One scratch row per plane.
+	uint8_t plane_weights[2][MAX_ASTC_HDR_BLOCK_W * MAX_ASTC_HDR_BLOCK_H];
+	const uint8_t* const pSrc_planes[2] = { pSrc_weights0, pSrc_weights1 };
+
+	for (uint32_t plane = 0; plane < 2; plane++)
+	{
+		downsample_ise_weights(
+			dequant_weight_ise_range, quant_weight_ise_range,
+			block_w, block_h,
+			grid_w, grid_h,
+			pSrc_planes[plane], plane_weights[plane]);
+	}
+
+	// Interleave the two planes sample by sample.
+	const uint32_t total_grid_samples = grid_w * grid_h;
+	uint8_t* pOut = pDst_weights;
+	for (uint32_t s = 0; s < total_grid_samples; s++)
+	{
+		*pOut++ = plane_weights[0][s];
+		*pOut++ = plane_weights[1][s];
+	}
+}
+
+static bool refine_endpoints_mode11( // Re-fits mode 11 endpoints to a fixed weight grid; overwrites pEndpoint_vals and returns true only if the measured error drops.
+	uint32_t endpoint_ise_range, // passed to pack_mode11() and decode_mode11_to_qlog12()
+	uint8_t* pEndpoint_vals, // the endpoints to optimize
+	uint32_t block_w, uint32_t block_h, // block dimensions
+	uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[], // half floats feed the error measurement, q16 vectors feed the endpoint fit
+	const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets (nullptr selects the identity table built below)
+	astc_hdr_codec_base_options& coptions, // supplies m_r_err_scale, m_g_err_scale and m_q_log_bias for the error sums; also forwarded to pack_mode11()
+	bool direct_only, int first_submode, int last_submode, // forwarded unchanged to pack_mode11()
+	opt_mode_t opt_mode) // selects the endpoint fitting method; cNoOpt returns false immediately
+{
+	if (opt_mode == cNoOpt)
+		return false;
+
+	const uint32_t num_block_pixels = block_w * block_h;
+	// No offset table supplied: build the identity table 0..num_block_pixels-1.
+	uint8_t def_pixel_block_ofs[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	if (!pPixel_block_ofs)
+	{
+		for (uint32_t i = 0; i < num_block_pixels; i++)
+			def_pixel_block_ofs[i] = (uint8_t)i;
+		
+		pPixel_block_ofs = def_pixel_block_ofs;
+	}
+
+	const uint32_t num_weights = grid_w * grid_h;
+	// Convert the grid's ISE weight symbols to raw weight values through the dequantization table.
+	uint8_t dequantized_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	for (uint32_t i = 0; i < num_weights; i++)
+		dequantized_raw_weights[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[pWeights[i]];
+	// Expand the weight grid to block resolution with astc_helpers::upsample_weight_grid().
+	uint8_t upsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+	astc_helpers::upsample_weight_grid(block_w, block_h, grid_w, grid_h, dequantized_raw_weights, upsampled_weights);
+
+	aabb3F color_box_q16(cInitExpand);
+	// Gather each pixel's upsampled weight (raw byte and weight/64 float) and grow the q16 color bounding box.
+	uint8_t trial_blk_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+	float trial_blk_raw_weightsf[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		assert(pPixel_block_ofs[i] < num_block_pixels);
+
+		trial_blk_raw_weights[i] = upsampled_weights[pPixel_block_ofs[i]];
+		trial_blk_raw_weightsf[i] = (float)trial_blk_raw_weights[i] * (1.0f / 64.0f);
+	}
+	// Compute candidate low/high endpoints (l_q16, h_q16) with the method selected by opt_mode.
+	vec3F l_q16, h_q16;
+	if (opt_mode == cOrdinaryLeastSquares)
+	{
+		if (!compute_least_squares_endpoints_rgb_raw_weights(num_pixels, trial_blk_raw_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16))
+			return false;
+	}
+	else if ((opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy))
+	{
+		vec3F block_mean_color_q16(calc_mean(num_pixels, pBlock_pixels_q16));
+		vec3F block_axis_q16(calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16));
+		float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL; // l/h: min/max projection of the mean-centered pixels onto block_axis_q16
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+			float kd = k.dot(block_axis_q16);
+			if (kd < l)
+				l = kd;
+			if (kd > h)
+				h = kd;
+		}
+		float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+		if (h == l) // all projections are equal: use uniform emphasis
+		{
+			for (uint32_t i = 0; i < num_pixels; i++)
+				emphasis_weights[i] = 1.0f;
+		}
+		else
+		{
+			float mid = (0.0f - l) / (h - l); // normalized position of projection 0 (the block mean) within [l, h]
+			mid = clamp(mid, .01f, .99f);
+				
+			float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT;
+			if (opt_mode == cWeightedLeastSquaresHeavy)
+				lw = LOW_EMPHASIS_WEIGHT_HEAVY, mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY, hw = HIGH_EMPHASIS_WEIGHT_HEAVY;
+
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+				float kd = k.dot(block_axis_q16);
+
+				assert((kd >= l) && (kd <= h));
+
+				float v = (kd - l) / (h - l); // normalize the projection to [0, 1]
+				// Map v to an emphasis weight: lw..mw below mid, mw towards hw at or above mid.
+				if (v < mid)
+					v = lerp(lw, mw, v / mid);
+				else
+					v = lerp(mw, hw, (v - mid) * (1.0f - mid)); // NOTE(review): multiplies by (1.0f - mid) instead of dividing, so the interpolant stays below 1 even at v == 1 - confirm this is intended
+
+				emphasis_weights[i] = v;
+			}
+		}
+
+		if (!compute_weighted_least_squares_endpoints_rgb(num_pixels, nullptr, nullptr, trial_blk_raw_weightsf, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16))
+			return false;
+	}
+	else
+	{
+		assert(opt_mode == cWeightedAverage);
+
+		l_q16.set(0.0f);
+		float total_low = 0.0f;
+
+		h_q16.set(0.0f);
+		float total_high = 0.0f;
+		// Accumulate each pixel into the low sum with weight (1 - w/64) and into the high sum with weight w/64.
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			vec3F p(pBlock_pixels_q16[i]);
+			float lerp = (float)trial_blk_raw_weights[i] * (1.0f / 64.0f);
+
+			l_q16 += p * (1.0f - lerp);
+			total_low += (1.0f - lerp);
+
+			h_q16 += p * lerp;
+			total_high += lerp;
+		}
+		// Normalize each sum by its total weight; give up if either total is zero.
+		if (total_low != 0.0f)
+			l_q16 *= (1.0f / total_low);
+		else
+			return false;
+
+		if (total_high != 0.0f)
+			h_q16 *= (1.0f / total_high);
+		else
+			return false;
+	}
+	// Pack the candidate endpoints with pack_mode11(); bail out if packing fails.
+	uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS];
+	
+	uint32_t submode_used;
+
+	bool pack_succeeded = pack_mode11(l_q16, h_q16, endpoint_ise_range, trial_endpoints, coptions, direct_only, first_submode, last_submode, false, submode_used);
+	if (!pack_succeeded)
+		return false;
+	// Decode both the current and the trial endpoints so their errors can be compared.
+	int cur_e[2][3];
+	if (!decode_mode11_to_qlog12(pEndpoint_vals, cur_e, endpoint_ise_range))
+		return false;
+
+	int trial_e[2][3];
+	if (!decode_mode11_to_qlog12(trial_endpoints, trial_e, endpoint_ise_range))
+		return false;
+	// Shift the decoded values left by 4 bits; the blended results below are handed to qlog16_to_half().
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		cur_e[0][i] <<= 4;
+		cur_e[1][i] <<= 4;
+
+		trial_e[0][i] <<= 4;
+		trial_e[1][i] <<= 4;
+	}
+
+	const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT = coptions.m_g_err_scale;
+
+	double cur_error = 0, trial_error = 0;
+	// Sum the channel-weighted squared error in q() space over all pixels, for both endpoint sets.
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p][0];
+
+		const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias);
+
+		const uint32_t c = trial_blk_raw_weights[p]; // this pixel's raw weight
+		assert(c <= 64);
+		// Error using the current endpoints.
+		{
+			half_float rf, gf, bf;
+
+			{
+				uint32_t r0 = cur_e[0][0], r1 = cur_e[1][0];
+				int ri = (r0 * (64 - c) + r1 * c + 32) / 64; // rounded weighted blend of the two endpoints
+				rf = astc_helpers::qlog16_to_half(ri);
+			}
+
+			{
+				uint32_t g0 = cur_e[0][1], g1 = cur_e[1][1];
+				int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+				gf = astc_helpers::qlog16_to_half(gi);
+			}
+
+			{
+				uint32_t b0 = cur_e[0][2], b1 = cur_e[1][2];
+				int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+				bf = astc_helpers::qlog16_to_half(bi);
+			}
+
+			const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias);
+
+			const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q;
+
+			cur_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+		}
+		// Error using the trial endpoints.
+		{
+			half_float rf, gf, bf;
+
+			{
+				uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0];
+				int ri = (r0 * (64 - c) + r1 * c + 32) / 64;
+				rf = astc_helpers::qlog16_to_half(ri);
+			}
+
+			{
+				uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1];
+				int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+				gf = astc_helpers::qlog16_to_half(gi);
+			}
+
+			{
+				uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2];
+				int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+				bf = astc_helpers::qlog16_to_half(bi);
+			}
+
+			const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias);
+
+			const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q;
+
+			trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+		}
+
+	} // p
+	// Adopt the trial endpoints only on a strict improvement.
+	if (trial_error < cur_error)
+	{
+		memcpy(pEndpoint_vals, trial_endpoints, NUM_MODE11_ENDPOINTS);
+		return true;
+	}
+
+	return false;
+}
+
+static bool refine_endpoints_mode7( // Tries a re-derived high color and scale for mode 7 endpoints; overwrites pEndpoint_vals and returns true only if the measured error drops.
+	uint32_t endpoint_ise_range, // passed to pack_mode7() and decode_mode7_to_qlog12()
+	uint8_t* pEndpoint_vals, // the endpoints to optimize
+	uint32_t block_w, uint32_t block_h, // block dimensions
+	uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[], // half floats feed the error measurement, q16 vectors feed the mean color
+	const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets (nullptr selects the identity table built below)
+	astc_hdr_codec_base_options& coptions, // supplies m_r_err_scale, m_g_err_scale and m_q_log_bias for the error sums; also forwarded to pack_mode7()
+	int first_submode, int last_submode) // forwarded unchanged to pack_mode7()
+{
+	const uint32_t num_block_pixels = block_w * block_h;
+	// No offset table supplied: build the identity table 0..num_block_pixels-1.
+	uint8_t def_pixel_block_ofs[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	if (!pPixel_block_ofs)
+	{
+		for (uint32_t i = 0; i < num_block_pixels; i++)
+			def_pixel_block_ofs[i] = (uint8_t)i;
+
+		pPixel_block_ofs = def_pixel_block_ofs;
+	}
+
+	const uint32_t num_weights = grid_w * grid_h;
+	// Convert the grid's ISE weight symbols to raw weight values through the dequantization table.
+	uint8_t dequantized_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	for (uint32_t i = 0; i < num_weights; i++)
+		dequantized_raw_weights[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[pWeights[i]];
+	// Expand the weight grid to block resolution with astc_helpers::upsample_weight_grid().
+	uint8_t upsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+	astc_helpers::upsample_weight_grid(block_w, block_h, grid_w, grid_h, dequantized_raw_weights, upsampled_weights);
+	// Gather each pixel's upsampled weight through the offset table.
+	uint8_t trial_blk_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		assert(pPixel_block_ofs[i] < num_block_pixels);
+
+		trial_blk_raw_weights[i] = upsampled_weights[pPixel_block_ofs[i]];
+	}
+
+	//--
+	// Decode the current endpoints and scale; cur_s gets the same 4-bit left shift applied to the endpoint values further below.
+	int cur_e[2][3];
+	int cur_s = 0;
+	if (!decode_mode7_to_qlog12(pEndpoint_vals, cur_e, &cur_s, endpoint_ise_range))
+		return false;
+
+	cur_s <<= 4;
+
+	vec3F block_mean_color_q16(calc_mean(num_pixels, pBlock_pixels_q16));
+	// Start the new high color at the block mean, then add cur_s times the average of (1 - weight/64) to every channel.
+	vec3F new_high_color_q16(block_mean_color_q16);
+		
+	const float one_over_num_pixels = 1.0f / (float)num_pixels;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		float lerp = trial_blk_raw_weights[i] * (1.0f / 64.0f);
+
+		float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
+		new_high_color_q16[0] += k;
+		new_high_color_q16[1] += k;
+		new_high_color_q16[2] += k;
+	}
+					
+	// Given a set of selectors and a high color, try to compute a better S.
+	float t = 0.0f; // becomes the average of (1 - weight/64) over the pixels
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		float lerp = trial_blk_raw_weights[i] * (1.0f / 64.0f);
+
+		t += (1.0f) - lerp;
+	}
+
+	t *= one_over_num_pixels;
+	// Bail out when t is near zero, since it is used as a divisor below.
+	if (fabs(t) < .0000125f)
+		return false;
+
+	uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS];
+	// First pass: pack the new high color with the current scale, then decode what was actually stored.
+	uint32_t submode_used;
+	if (!pack_mode7(new_high_color_q16, (float)cur_s, endpoint_ise_range, trial_endpoints, weight_ise_range, coptions, first_submode, last_submode, false, submode_used))
+		return false;
+
+	int trial_e[2][3];
+	if (!decode_mode7_to_qlog12(trial_endpoints, trial_e, nullptr, endpoint_ise_range))
+		return false;
+
+	vec3F cur_h_q16((float)(trial_e[1][0] << 4), (float)(trial_e[1][1] << 4), (float)(trial_e[1][2] << 4));
+	// Re-derive the scale from the red channel only: (decoded high red - mean red) / t, rounded up.
+	float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
+	//float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
+	//float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
+	float new_s_q16 = ceilf(s_r);
+	// Second pass: repack with the re-derived scale and decode again for the error comparison.
+	if (!pack_mode7(new_high_color_q16, new_s_q16, endpoint_ise_range, trial_endpoints, weight_ise_range, coptions, first_submode, last_submode, false, submode_used))
+		return false;
+
+	if (!decode_mode7_to_qlog12(trial_endpoints, trial_e, nullptr, endpoint_ise_range))
+		return false;
+	
+	// --
+	// Shift the decoded values left by 4 bits; the blended results below are handed to qlog16_to_half().
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		cur_e[0][i] <<= 4;
+		cur_e[1][i] <<= 4;
+
+		trial_e[0][i] <<= 4;
+		trial_e[1][i] <<= 4;
+	}
+
+	const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT = coptions.m_g_err_scale;
+
+	double cur_error = 0, trial_error = 0;
+	// Sum the channel-weighted squared error in q() space over all pixels, for both endpoint sets.
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p][0];
+
+		const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias);
+
+		const uint32_t c = trial_blk_raw_weights[p]; // this pixel's raw weight
+		assert(c <= 64);
+		// Error using the current endpoints.
+		{
+			half_float rf, gf, bf;
+
+			{
+				uint32_t r0 = cur_e[0][0], r1 = cur_e[1][0];
+				int ri = (r0 * (64 - c) + r1 * c + 32) / 64; // rounded weighted blend of the two endpoints
+				rf = astc_helpers::qlog16_to_half(ri);
+			}
+
+			{
+				uint32_t g0 = cur_e[0][1], g1 = cur_e[1][1];
+				int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+				gf = astc_helpers::qlog16_to_half(gi);
+			}
+
+			{
+				uint32_t b0 = cur_e[0][2], b1 = cur_e[1][2];
+				int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+				bf = astc_helpers::qlog16_to_half(bi);
+			}
+
+			const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias);
+
+			const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q;
+
+			cur_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+		}
+		// Error using the trial endpoints.
+		{
+			half_float rf, gf, bf;
+
+			{
+				uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0];
+				int ri = (r0 * (64 - c) + r1 * c + 32) / 64;
+				rf = astc_helpers::qlog16_to_half(ri);
+			}
+
+			{
+				uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1];
+				int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+				gf = astc_helpers::qlog16_to_half(gi);
+			}
+
+			{
+				uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2];
+				int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+				bf = astc_helpers::qlog16_to_half(bi);
+			}
+
+			const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias);
+
+			const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q;
+
+			trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+		}
+
+	} // p
+	// Adopt the trial endpoints only on a strict improvement.
+	if (trial_error < cur_error)
+	{
+		memcpy(pEndpoint_vals, trial_endpoints, NUM_MODE7_ENDPOINTS);
+		return true;
+	}
+
+	return false;
+}
+
+bool refine_endpoints( // Dispatches endpoint refinement by color endpoint mode; returns false for any cem other than 7 or 11.
+	uint32_t cem, // 7 selects refine_endpoints_mode7(), 11 selects refine_endpoints_mode11()
+	uint32_t endpoint_ise_range,
+	uint8_t* pEndpoint_vals, // the endpoints to optimize
+	uint32_t block_w, uint32_t block_h, // block dimensions
+	uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets
+	astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode) // opt_mode is only forwarded on the cem == 11 path
+{
+	if (cem == 7)
+	{
+		return refine_endpoints_mode7(
+			endpoint_ise_range,
+			pEndpoint_vals,
+			block_w, block_h,
+			grid_w, grid_h, pWeights, weight_ise_range,
+			num_pixels, pBlock_pixels_half, pBlock_pixels_q16,
+			pPixel_block_ofs,
+			coptions,
+			FIRST_MODE7_SUBMODE_INDEX, MAX_MODE7_SUBMODE_INDEX); // first_submode, last_submode
+	}
+	else if (cem == 11)
+	{
+		return refine_endpoints_mode11(
+			endpoint_ise_range,
+			pEndpoint_vals,
+			block_w, block_h,
+			grid_w, grid_h, pWeights, weight_ise_range,
+			num_pixels, pBlock_pixels_half, pBlock_pixels_q16,
+			pPixel_block_ofs,
+			coptions,
+			false, FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, opt_mode); // direct_only = false, then first_submode, last_submode
+	}
+
+	return false;
+}
+
+} // namespace basisu
+

+ 423 - 0
thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h

@@ -0,0 +1,423 @@
+// File: basisu_astc_hdr_common.h
+#pragma once
+#include "basisu_enc.h"
+#include "basisu_gpu_texture.h"
+#include "../transcoder/basisu_astc_helpers.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
+
+namespace basisu
+{
+	const uint32_t MAX_ASTC_HDR_BLOCK_W = 6, MAX_ASTC_HDR_BLOCK_H = 6;
+	const uint32_t MAX_ASTC_HDR_ENC_BLOCK_PIXELS = 6 * 6; // pixel count of a 6x6 block (the W/H maxima above)
+
+	const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec)
+	const uint32_t MODE7_TOTAL_SUBMODES = 6;
+		
+	// [ise_range][0] = # levels
+	// [ise_range][1...] = lerp value [0,64]
+	// in ASTC order
+	// Supported ISE weight ranges: 0 to 11, 12 total
+	const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_2_LEVELS; // ISE 0=2 levels
+	const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_32_LEVELS; // ISE 11=32 levels
+	const uint32_t MIN_SUPPORTED_WEIGHT_LEVELS = 2;
+	const uint32_t MAX_SUPPORTED_WEIGHT_LEVELS = 32;
+
+	extern const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33]; // 33 = one level-count entry plus up to MAX_SUPPORTED_WEIGHT_LEVELS lerp values
+
+	const float Q_LOG_BIAS_4x4 = .125f; // the original UASTC HDR 4x4 log bias
+	const float Q_LOG_BIAS_6x6 = 1.0f; // the log bias both encoders use now
+
+	const float LDR_TO_HDR_NITS = 100.0f;
+
+	struct astc_hdr_codec_base_options // Options shared by the ASTC HDR encoders; every field is set by init().
+	{
+		float m_r_err_scale, m_g_err_scale; // multipliers on the red and green squared-error terms in refine_endpoints_mode7/11 (the blue term is unscaled there)
+		float m_q_log_bias; // bias handed to q()/q2(), which add it to the pixel value before taking the float's bit pattern
+		
+		bool m_ultra_quant;
+		
+		// If true, the ASTC HDR compressor is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however.
+		bool m_allow_uber_mode;
+
+		bool m_mode7_full_s_optimization;
+
+		bool m_take_first_non_clamping_mode11_submode;
+		bool m_take_first_non_clamping_mode7_submode;
+
+		bool m_disable_weight_plane_optimization;
+		
+		astc_hdr_codec_base_options() { init(); } // construction delegates entirely to init()
+
+		void init(); // declared here only; the definition is not in this header
+	};
+
+	inline int get_bit( // Returns bit number src_bit of src_val as 0 or 1.
+		int src_val, int src_bit) // src_bit must be in [0, 31] (asserted)
+	{
+		assert(src_bit >= 0 && src_bit <= 31);
+		int bit = (src_val >> src_bit) & 1;
+		return bit;
+	}
+
+	inline void pack_bit( // ORs bit src_bit of src_val into bit position dst_bit of dst; bits already set in dst are never cleared.
+		int& dst, int dst_bit, // dst_bit must be in [0, 31] (asserted)
+		int src_val, int src_bit = 0) // src_bit defaults to the lowest bit of src_val
+	{
+		assert(dst_bit >= 0 && dst_bit <= 31);
+		int bit = get_bit(src_val, src_bit);
+		dst |= (bit << dst_bit);
+	}
+
+	inline uint32_t get_max_qlog(uint32_t bits) // Returns the basist::MAX_QLOG<bits> constant for bits in {7, 8, 9, 10, 11, 12, 16}.
+	{
+		switch (bits)
+		{
+		case 7: return basist::MAX_QLOG7;
+		case 8: return basist::MAX_QLOG8;
+		case 9: return basist::MAX_QLOG9;
+		case 10: return basist::MAX_QLOG10;
+		case 11: return basist::MAX_QLOG11;
+		case 12: return basist::MAX_QLOG12;
+		case 16: return basist::MAX_QLOG16;
+		default: assert(0); break; // unsupported bit count: asserts, then falls through to return 0
+		}
+		return 0;
+	}
+
+#if 0
+	inline float get_max_qlog_val(uint32_t bits)
+	{
+		switch (bits)
+		{
+		case 7: return MAX_QLOG7_VAL;
+		case 8: return MAX_QLOG8_VAL;
+		case 9: return MAX_QLOG9_VAL;
+		case 10: return MAX_QLOG10_VAL;
+		case 11: return MAX_QLOG11_VAL;
+		case 12: return MAX_QLOG12_VAL;
+		case 16: return MAX_QLOG16_VAL;
+		default: assert(0); break;
+		}
+		return 0;
+	}
+#endif
+
+#if 0
+	// Input is the low 11 bits of the qlog
+	// Returns the 10-bit mantissa of the half float value
+	int qlog11_to_half_float_mantissa(int M)
+	{
+		assert(M <= 0x7FF);
+		int Mt;
+		if (M < 512)
+			Mt = 3 * M;
+		else if (M >= 1536)
+			Mt = 5 * M - 2048;
+		else
+			Mt = 4 * M - 512;
+		return (Mt >> 3);
+	}
+#endif
+
+	// Input is the 10-bit mantissa of the half float value
+	// Output is the 11-bit qlog value
+	// Inverse of qlog11_to_half_float_mantissa()
+	inline int half_float_mantissa_to_qlog11(int hf) // Tries the inverse of each of the three linear segments and returns the one whose result lies in that segment's range.
+	{
+		int q0 = (hf * 8 + 2) / 3; // rounded inverse of Mt = 3 * M, the segment for M < 512
+		int q1 = (hf * 8 + 2048 + 4) / 5; // rounded inverse of Mt = 5 * M - 2048, the segment for M >= 1536
+		// Accept the low or high segment when its candidate falls inside that segment.
+		if (q0 < 512)
+			return q0;
+		else if (q1 >= 1536)
+			return q1;
+		// Otherwise use the middle segment.
+		int q2 = (hf * 8 + 512 + 2) / 4; // rounded inverse of Mt = 4 * M - 512
+		return q2;
+	}
+
+	inline int half_to_qlog16(int hf) // Converts a half-float bit pattern (asserted non-negative and finite) to its qlog16 value.
+	{
+		assert(!basist::half_is_signed((basist::half_float)hf) && !basist::is_half_inf_or_nan((basist::half_float)hf));
+
+		// extract 5 bits exponent, which is carried through to qlog16 unchanged
+		const int exp = (hf >> 10) & 0x1F;
+
+		// extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless)
+		const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
+		assert(mantissa <= 0x7FF);
+
+		// Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
+		uint32_t qlog16 = (exp << 11) | mantissa; // exponent in bits 11..15, qlog11 mantissa in bits 0..10
+
+		// should be a lossless operation
+		assert(astc_helpers::qlog16_to_half(qlog16) == hf);
+
+		return qlog16;
+	}
+
+	void interpolate_qlog12_colors(
+		const int e[2][3],
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range);
+
+	bool get_astc_hdr_mode_11_block_colors(
+		const uint8_t* pEndpoints,
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
+
+	bool get_astc_hdr_mode_7_block_colors(
+		const uint8_t* pEndpoints,
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
+			
+	// Fast high precision piecewise linear approximation of log2(bias+x).
+	// Half may be zero, positive or denormal. No NaN/Inf/negative.
+	BASISU_FORCE_INLINE double q(basist::half_float x, float log_bias) // Converts x to float, adds log_bias, and returns the sum's 32-bit pattern as a double.
+	{
+		union { float f; int32_t i; uint32_t u; } fi; // lets the float be read back as its raw bits
+
+		fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
+
+		assert(fi.f >= 0.0f);
+						
+		fi.f += log_bias;
+
+		return (double)fi.u; // approx log2f(fi.f), need to return double for the precision
+	}
+
+	BASISU_FORCE_INLINE uint32_t q2(basist::half_float x, float log_bias) // Same computation as q(), but returns the 32-bit pattern as an integer instead of a double.
+	{
+		union { float f; int32_t i; uint32_t u; } fi; // lets the float be read back as its raw bits
+
+		fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
+
+		assert(fi.f >= 0.0f);
+		
+		fi.f += log_bias;
+
+		return fi.u; // raw bits of (x as float + log_bias)
+	}
+
+	double eval_selectors(
+		uint32_t num_pixels,
+		uint8_t* pWeights,
+		uint32_t ise_weight_range,
+		const basist::half_float* pBlock_pixels_half,
+		uint32_t num_weight_levels,
+		const basist::half_float* pDecoded_half,
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t usable_selector_bitmask = UINT32_MAX);
+
+	double eval_selectors_dual_plane(
+		uint32_t channel_index,
+		uint32_t num_pixels,
+		uint8_t* pWeights0, uint8_t* pWeights1,
+		const basist::half_float* pBlock_pixels_half,
+		uint32_t num_weight_levels,
+		const basist::half_float* pDecoded_half,
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t usable_selector_bitmask = UINT32_MAX);
+
+	double compute_block_error(uint32_t num_pixels, const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_base_options& coptions);
+
+	const uint32_t FIRST_MODE7_SUBMODE_INDEX = 0;
+	const uint32_t MAX_MODE7_SUBMODE_INDEX = 5;
+
+	bool pack_mode7(
+		const vec3F& high_color_q16, const float s_q16,
+		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+		uint32_t ise_weight_range, // only used for determining biasing during CEM 7 packing
+		const astc_hdr_codec_base_options& coptions,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);
+
+	bool try_mode7(
+		uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& high_color_q16, const float s_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
+		uint32_t ise_endpoint_range,
+		int32_t first_submode = 0, int32_t last_submode = MAX_MODE7_SUBMODE_INDEX);
+
+	bool pack_mode11(
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);
+
+	bool try_mode11(uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping);
+
+	bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping);
+
+	const int FIRST_MODE11_SUBMODE_INDEX = -1;
+	const int MAX_MODE11_SUBMODE_INDEX = 7;
+
+	enum opt_mode_t // Endpoint fitting method; the notes below describe how refine_endpoints_mode11() treats each value.
+	{
+		cNoOpt, // no refinement: returns false immediately
+		cOrdinaryLeastSquares, // fit via compute_least_squares_endpoints_rgb_raw_weights()
+		cWeightedLeastSquares, // fit via compute_weighted_least_squares_endpoints_rgb() with the LOW/MIDDLE/HIGH_EMPHASIS_WEIGHT constants
+		cWeightedLeastSquaresHeavy, // same fit, using the *_EMPHASIS_WEIGHT_HEAVY constants
+		cWeightedAverage // endpoints are weight-weighted averages of the pixel colors
+	};
+
+	struct encode_astc_block_stats // Per-block statistics that the encode_astc_hdr_block_* functions accept through an optional pointer.
+	{
+		uint32_t m_num_pixels;
+		vec3F m_mean_q16; // NOTE(review): presumably the mean of the block's q16 pixels - confirm against init()'s definition
+		vec3F m_axis_q16; // NOTE(review): presumably the block's principal axis in q16 space - confirm against init()'s definition
+
+		void init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[]); // declared here only; the definition is not in this header
+	};
+
+	double encode_astc_hdr_block_mode_11(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints, uint8_t* blk_weights,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only,
+		uint32_t ise_endpoint_range,
+		bool uber_mode,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping, 
+		opt_mode_t opt_mode, 
+		const encode_astc_block_stats *pBlock_stats = nullptr);
+
+	double encode_astc_hdr_block_downsampled_mode_11(
+		uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
+		uint32_t ise_weight_range, uint32_t ise_endpoint_range,
+		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		double cur_block_error,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+		uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
+		const astc_hdr_codec_base_options& coptions,
+		const encode_astc_block_stats* pBlock_stats = nullptr);
+
+	double encode_astc_hdr_block_mode_11_dual_plane(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t channel_index,		// 0-2
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only,
+		uint32_t ise_endpoint_range,
+		bool uber_mode,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, 
+		bool ignore_clamping);
+
+	double encode_astc_hdr_block_mode_7(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints,  //[4]
+		uint8_t* blk_weights, // [num_pixels]
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t ise_endpoint_range, 
+		int first_submode = 0, int last_submode = MAX_MODE7_SUBMODE_INDEX, 
+		const encode_astc_block_stats *pBlock_stats = nullptr);
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	struct mode11_log_desc // Field-level description of a mode 11 endpoint encoding, used by pack_mode11(desc, ...) and unpack_mode11().
+	{
+		int32_t m_submode;
+		int32_t m_maj_comp; // the value 3 marks the direct encoding (see is_direct())
+
+		// Or R0, G0, B0 if maj_comp==3 (direct)
+		int32_t m_a;  // positive
+		int32_t m_c;  // positive
+		int32_t m_b0; // positive
+
+		// Or R1, G1, B1 if maj_comp==3 (direct)
+		int32_t m_b1; // positive
+		int32_t m_d0; // if not direct, is signed
+		int32_t m_d1; // if not direct, is signed
+
+		// limits if not direct
+		int32_t m_a_bits, m_c_bits, m_b_bits, m_d_bits;
+		int32_t m_max_a_val, m_max_c_val, m_max_b_val, m_min_d_val, m_max_d_val;
+
+		void clear() { clear_obj(*this); } // resets the whole struct through clear_obj()
+
+		bool is_direct() const { return m_maj_comp == 3; } // true when the fields hold R0/G0/B0 and R1/G1/B1 directly
+	};
+
+	//--------------------------------------------------------------------------------------------------------------------------
+	bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh);
+
+	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
+	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
+	void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16);
+	
+	bool pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints);
+	void unpack_mode11(const uint8_t* pEndpoints, mode11_log_desc& desc);
+
+	void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
+	void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
+		
+	void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights);
+
+	const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);
+	
+	void downsample_weight_grid(
+		const float* pMatrix_weights,
+		uint32_t bx, uint32_t by,		// source/from dimension (block size)
+		uint32_t wx, uint32_t wy,		// dest/to dimension (grid size)
+		const uint8_t* pSrc_weights,	// these are dequantized weights, NOT ISE symbols, [by][bx]
+		uint8_t* pDst_weights);			// [wy][wx]
+
+	void downsample_ise_weights(
+		uint32_t weight_ise_range, uint32_t quant_weight_ise_range,
+		uint32_t block_w, uint32_t block_h,
+		uint32_t grid_w, uint32_t grid_h,
+		const uint8_t* pSrc_weights, uint8_t* pDst_weights);
+
+	void downsample_ise_weights_dual_plane(
+		uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
+		uint32_t block_w, uint32_t block_h,
+		uint32_t grid_w, uint32_t grid_h,
+		const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1,
+		uint8_t* pDst_weights);
+
+	bool refine_endpoints(
+		uint32_t cem,
+		uint32_t endpoint_ise_range,
+		uint8_t* pEndpoint_vals, // the endpoints to optimize
+		uint32_t block_w, uint32_t block_h, // block dimensions
+		uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
+		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets
+		astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode);
+	
+	extern bool g_astc_hdr_enc_initialized;
+
+	// This MUST be called before encoding any blocks.
+	void astc_hdr_enc_init();
+
+} // namespace basisu
+

+ 0 - 3310
thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp

@@ -1,3310 +0,0 @@
-// basisu_astc_hdr_enc.cpp
-#include "basisu_astc_hdr_enc.h"
-#include "../transcoder/basisu_transcoder.h"
-
-using namespace basist;
-
-namespace basisu
-{
-
-const float DEF_R_ERROR_SCALE = 2.0f; // Default red-channel error scale; copied into m_r_err_scale by astc_hdr_codec_options::init().
-const float DEF_G_ERROR_SCALE = 3.0f; // Default green-channel error scale; copied into m_g_err_scale by astc_hdr_codec_options::init().
-
-static inline uint32_t get_max_qlog(uint32_t bits) // Maps a qlog bit width (7-12 or 16) to the matching MAX_QLOG* constant.
-{
-	switch (bits)
-	{
-	case 7: return MAX_QLOG7;
-	case 8: return MAX_QLOG8;
-	case 9: return MAX_QLOG9;
-	case 10: return MAX_QLOG10;
-	case 11: return MAX_QLOG11;
-	case 12: return MAX_QLOG12;
-	case 16: return MAX_QLOG16;
-	default: assert(0); break; // Any other width is a caller bug: asserts in debug builds.
-	}
-	return 0; // Only reached for an unsupported width when asserts are compiled out.
-}
-
-#if 0 // Disabled code: float-returning counterpart of get_max_qlog() that yields the MAX_QLOG*_VAL constants.
-static inline float get_max_qlog_val(uint32_t bits)
-{
-	switch (bits)
-	{
-	case 7: return MAX_QLOG7_VAL;
-	case 8: return MAX_QLOG8_VAL;
-	case 9: return MAX_QLOG9_VAL;
-	case 10: return MAX_QLOG10_VAL;
-	case 11: return MAX_QLOG11_VAL;
-	case 12: return MAX_QLOG12_VAL;
-	case 16: return MAX_QLOG16_VAL;
-	default: assert(0); break; // Unsupported width: asserts in debug builds.
-	}
-	return 0; // Fallback for an unsupported width when asserts are compiled out.
-}
-#endif
-
-static inline int get_bit( // Returns bit number src_bit (0..31) of src_val as 0 or 1.
-	int src_val, int src_bit)
-{
-	assert(src_bit >= 0 && src_bit <= 31);
-	int bit = (src_val >> src_bit) & 1; // NOTE(review): src_val is signed and pack_astc_mode11_submode() passes values that can be negative (best_vd0/best_vd1); right-shifting a negative int is implementation-defined before C++20.
-	return bit;
-}
-
-static inline void pack_bit( // ORs bit src_bit of src_val into bit dst_bit (0..31) of dst.
-	int& dst, int dst_bit,
-	int src_val, int src_bit = 0)
-{
-	assert(dst_bit >= 0 && dst_bit <= 31);
-	int bit = get_bit(src_val, src_bit);
-	dst |= (bit << dst_bit); // OR only: a bit already set in dst is never cleared.
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-astc_hdr_codec_options::astc_hdr_codec_options() // Constructs the options with the defaults established by init().
-{
-	init();
-}
-
-void astc_hdr_codec_options::init() // Resets the options: error weights, uber-mode permission, then the settings of quality level cDefaultLevel.
-{
-	m_bc6h_err_weight = .85f;
-	m_r_err_scale = DEF_R_ERROR_SCALE;
-	m_g_err_scale = DEF_G_ERROR_SCALE;
-
-	// Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output.
-	m_allow_uber_mode = false;
-
-	// Must set best quality level first to set defaults.
-	set_quality_best();
-
-	set_quality_level(cDefaultLevel); // Then apply the default level's overrides on top of the "best" baseline.
-}
-
-void astc_hdr_codec_options::set_quality_best() // Enables every mode with its full FIRST..LAST weight ISE range and all partition masks; partition estimation is turned off.
-{
-	m_mode11_direct_only = false;
-		
-	// highest achievable quality
-	m_use_solid = true;
-
-	m_use_mode11 = true;
-	m_mode11_uber_mode = true;
-	m_first_mode11_weight_ise_range = MODE11_FIRST_ISE_RANGE;
-	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
-	m_first_mode11_submode = -1; // NOTE(review): meaning of the -1 submode is not visible in this chunk - confirm where it is consumed.
-	m_last_mode11_submode = 7;
-
-	m_use_mode7_part1 = true;
-	m_first_mode7_part1_weight_ise_range = MODE7_PART1_FIRST_ISE_RANGE;
-	m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
-
-	m_use_mode7_part2 = true;
-	m_mode7_part2_part_masks = UINT32_MAX; // All mask bits set.
-	m_first_mode7_part2_weight_ise_range = MODE7_PART2_FIRST_ISE_RANGE;
-	m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
-
-	m_use_mode11_part2 = true;
-	m_mode11_part2_part_masks = UINT32_MAX; // All mask bits set.
-	m_first_mode11_part2_weight_ise_range = MODE11_PART2_FIRST_ISE_RANGE;
-	m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
-
-	m_refine_weights = true;
-
-	m_use_estimated_partitions = false;
-	m_max_estimated_partitions = 0;
-}
-
-void astc_hdr_codec_options::set_quality_normal() // Keeps all modes enabled but narrows the weight ISE ranges; does not touch m_mode11_direct_only, the mode 11 submode limits or the estimated-partition fields.
-{
-	m_use_solid = true;
-
-	// We'll allow uber mode in normal if the user allows it.
-	m_use_mode11 = true;
-	m_mode11_uber_mode = true;
-	m_first_mode11_weight_ise_range = 6;
-	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
-
-	m_use_mode7_part1 = true;
-	m_first_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; // first == last: only the LAST range is used.
-	m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
-
-	m_use_mode7_part2 = true;
-	m_mode7_part2_part_masks = UINT32_MAX;
-	m_first_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; // first == last: only the LAST range is used.
-	m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
-
-	m_use_mode11_part2 = true;
-	m_mode11_part2_part_masks = UINT32_MAX;
-	m_first_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; // first == last: only the LAST range is used.
-	m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
-
-	m_refine_weights = true;
-}
-
-void astc_hdr_codec_options::set_quality_fastest() // Leaves only solid blocks and single-range mode 11 enabled; uber mode, mode 7, 2-partition mode 11 and weight refinement are switched off.
-{
-	m_use_solid = true;
-
-	m_use_mode11 = true;
-	m_mode11_uber_mode = false;
-	m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; // first == last: only the LAST range is used.
-	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
-
-	m_use_mode7_part1 = false;
-	m_use_mode7_part2 = false;
-	m_use_mode11_part2 = false;
-
-	m_refine_weights = false;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-void astc_hdr_codec_options::set_quality_level(int level) // Applies the preset for quality level 0..4 (0 = fastest, 4 = best) and records the clamped level in m_level.
-{
-	level = clamp(level, cMinLevel, cMaxLevel); // Out-of-range requests are clamped rather than rejected.
-	
-	m_level = level;
-
-	switch (level) // No default case: a clamped level outside 0..4 changes nothing beyond m_level.
-	{
-	case 0:
-	{
-		set_quality_fastest();
-		break;
-	}
-	case 1: // "normal" preset with mode 7 disabled, two mode 11 weight ranges and at most one estimated partition.
-	{
-		set_quality_normal();
-
-		m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE - 1;
-		m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
-
-		m_use_mode7_part1 = false;
-		m_use_mode7_part2 = false;
-
-		m_use_estimated_partitions = true;
-		m_max_estimated_partitions = 1;
-
-		m_mode11_part2_part_masks = 1 | 2;
-		m_mode7_part2_part_masks = 1 | 2;
-		break;
-	}
-	case 2: // "normal" preset with up to two estimated partitions.
-	{
-		set_quality_normal();
-
-		m_use_estimated_partitions = true;
-		m_max_estimated_partitions = 2;
-
-		m_mode11_part2_part_masks = 1 | 2;
-		m_mode7_part2_part_masks = 1 | 2;
-
-		break;
-	}
-	case 3: // "best" preset, but with partition estimation enabled and reduced part masks.
-	{
-		set_quality_best();
-
-		m_use_estimated_partitions = true;
-		m_max_estimated_partitions = 2;
-
-		m_mode11_part2_part_masks = 1 | 2 | 4 | 8;
-		m_mode7_part2_part_masks = 1 | 2 | 4 | 8;
-
-		break;
-	}
-	case 4: // Unmodified "best" preset.
-	{
-		set_quality_best();
-
-		break;
-	}
-	}
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-#if 0 // Disabled code: 12-bit convenience wrapper around qlog_to_half_slow().
-static inline half_float qlog12_to_half_slow(uint32_t qlog12)
-{
-	return qlog_to_half_slow(qlog12, 12);
-}
-#endif
-
-// max usable qlog8 value is 247, 248=inf, >=249 is nan
-// max usable qlog7 value is 123, 124=inf, >=125 is nan
-
-// To go from a smaller qlog to a larger one, shift left by X bits.
-
-//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0
-
-// for qlog7's shift left by 1
-//half_float g_qlog8_to_half[256];
-//float g_qlog8_to_float[256];
-
-//half_float g_qlog12_to_half[4096];
-//float g_qlog12_to_float[4096];
-
-static half_float g_qlog16_to_half[65536]; // qlog16 -> half lookup table, filled by init_qlog_tables().
-
-inline half_float qlog_to_half(uint32_t val, uint32_t bits) // Converts a bits-wide (5..16) qlog value to half by widening it to 16 bits and indexing g_qlog16_to_half.
-{
-	assert((bits >= 5) && (bits <= 16));
-	assert(val < (1U << bits));
-	return g_qlog16_to_half[val << (16 - bits)]; // Widening is a plain left shift; low bits are zero.
-}
-
-// Nearest qlog7..qlog12 value for each positive half-float bit pattern (0..32767); filled by init_qlog_tables().
-static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768], g_half_to_qlog9[32768], g_half_to_qlog10[32768], g_half_to_qlog11[32768], g_half_to_qlog12[32768];
-
-const uint32_t HALF_TO_QLOG_TABS_BASE = 7; // Bit width of the table stored at index 0 of g_pHalf_to_qlog_tabs.
-static uint16_t* g_pHalf_to_qlog_tabs[8] = // Indexed by (bits - HALF_TO_QLOG_TABS_BASE); only the first 6 of the 8 slots are initialized, the rest are null.
-{
-	g_half_to_qlog7,
-	g_half_to_qlog8,
-
-	g_half_to_qlog9,
-	g_half_to_qlog10,
-
-	g_half_to_qlog11,
-	g_half_to_qlog12
-};
-
-static inline uint32_t half_to_qlog7_12(half_float h, uint32_t bits) // Table lookup: nearest bits-wide (7..12) qlog value for the positive half-float h.
-{
-	assert((bits >= HALF_TO_QLOG_TABS_BASE) && (bits <= 12));
-	assert(h < 32768); // Sign bit must be clear: the tables only cover positive halfs.
-
-	return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE][h];
-}
-
-#if 0 // Disabled code: forward mapping whose inverse is half_float_mantissa_to_qlog11().
-// Input is the low 11 bits of the qlog
-// Returns the 10-bit mantissa of the half float value
-static int qlog11_to_half_float_mantissa(int M)
-{
-	assert(M <= 0x7FF);
-	int Mt;
-	if (M < 512) // Three linear segments selected by M: [0,512), [512,1536), [1536,2047].
-		Mt = 3 * M;
-	else if (M >= 1536)
-		Mt = 5 * M - 2048;
-	else
-		Mt = 4 * M - 512;
-	return (Mt >> 3);
-}
-#endif
-
-// Input is the 10-bit mantissa of the half float value
-// Output is the 11-bit qlog value
-// Inverse of qlog11_to_half_float_mantissa()
-static inline int half_float_mantissa_to_qlog11(int hf)
-{
-	int q0 = (hf * 8 + 2) / 3; // Candidate from inverting the low segment (Mt = 3 * M).
-	int q1 = (hf * 8 + 2048 + 4) / 5; // Candidate from inverting the high segment (Mt = 5 * M - 2048).
-
-	if (q0 < 512) // Each candidate is accepted only if it lands inside its own segment.
-		return q0;
-	else if (q1 >= 1536)
-		return q1;
-
-	int q2 = (hf * 8 + 512 + 2) / 4; // Otherwise invert the middle segment (Mt = 4 * M - 512).
-	return q2;
-}
-
-static inline int half_to_qlog16(int hf) // Converts half-float bits to qlog16: 5-bit exponent in bits 11..15, 11-bit qlog mantissa in bits 0..10. The sign bit of hf is not read.
-{
-	// extract 5 bits exponent, which is carried through to qlog16 unchanged
-	const int exp = (hf >> 10) & 0x1F;
-
-	// extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless)
-	const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
-	assert(mantissa <= 0x7FF);
-
-	// Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
-	uint32_t qlog16 = (exp << 11) | mantissa;
-
-	// should be a lossless operation
-	assert(qlog16_to_half_slow(qlog16) == hf); // Debug-only round-trip check.
-
-	return qlog16;
-}
-
-static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits) // Reduces a 16-bit qlog to desired_bits (7..12) bits with rounding, clamped to the largest representable value.
-{
-	assert((desired_bits >= 7) && (desired_bits <= 12));
-	assert(q16 <= 65535);
-
-	const uint32_t shift = 16 - desired_bits;
-	uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift; // Bias is (half step - 1), so an exact half step rounds down.
-
-	uint32_t max_val = (1U << desired_bits) - 1U;
-	e = minimum<uint32_t>(e, max_val); // Rounding up near 65535 could otherwise overflow desired_bits.
-
-	return e;
-}
-
-static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector<float> &qlog16_to_float) // Fills pTable[h] with the bits-wide qlog whose decoded float is nearest to each positive half h, by brute-force search.
-{
-	assert(bits >= 5 && bits <= 12);
-	const uint32_t max_val = (1 << bits) - 1;
-
-	// For all positive half-floats
-	for (uint32_t h = 0; h < 32768; h++)
-	{
-		// Skip invalid values
-		if (is_half_inf_or_nan((half_float)h))
-			continue; // pTable[h] is left unwritten for inf/NaN inputs.
-		const float desired_val = half_to_float((half_float)h);
-
-		float best_err = 1e+30f;
-		uint32_t best_qlog = 0;
-
-		// For all possible qlog's
-		for (uint32_t i = 0; i <= max_val; i++)
-		{
-			// Skip invalid values
-			float v = qlog16_to_float[i << (16 - bits)]; // Widen the candidate to qlog16 to look up its decoded value.
-			if (std::isnan(v))
-				continue;
-
-			// Compute error
-			float err = fabs(v - desired_val);
-
-			// Find best
-			if (err < best_err) // Strict '<': on ties the lowest qlog index wins.
-			{
-				best_err = err;
-				best_qlog = i;
-			}
-		}
-
-		pTable[h] = (uint16_t)best_qlog;
-	}
-
-#if 0 // Disabled experiment: prints every q16 where the rounding shift disagrees with a brute-force nearest match, then exits.
-	uint32_t t = 0;
-
-	const uint32_t nb = 12;
-	int nb_shift = 16 - nb;
-
-	for (uint32_t q16 = 0; q16 < 65536; q16++)
-	{
-		half_float h = qlog16_to_half_slow(q16);
-		if (is_half_inf_or_nan(h))
-			continue;
-
-		int q7 = half_to_qlog7_12(h, nb);
-
-		uint32_t best_err = UINT32_MAX, best_l = 0;
-		for (int l = 0; l < (1 << nb); l++)
-		{
-			int dec_q16 = l << nb_shift;
-			int err = iabs(dec_q16 - q16);
-			if (err < best_err)
-			{
-				best_err = err;
-				best_l = l;
-			}
-		}
-
-		//int e = (q16 + 253) >> 9; // 345
-
-		int e = (q16 + (1 << (nb_shift - 1)) - 1) >> nb_shift; // 285
-		if (best_l != e)
-			//if (q7 != best_l)
-		{
-			printf("q16=%u, h=%u, q7=%u, e=%u, best_l=%u\n", q16, h, q7, e, best_l);
-			t++;
-		}
-	}
-
-	printf("Mismatches: %u\n", t);
-	exit(0);
-#endif
-}
-
-static void init_qlog_tables() // Builds g_qlog16_to_half and the six half -> qlog7..qlog12 tables.
-{
-	basisu::vector<float> qlog16_to_float(65536); // Temporary float view of the qlog16 table, used only while building the inverse tables.
-
-	// for all possible qlog16, compute the corresponding half float
-	for (uint32_t i = 0; i <= 65535; i++)
-	{
-		half_float h = qlog16_to_half_slow(i);
-		g_qlog16_to_half[i] = h;
-
-		qlog16_to_float[i] = half_to_float(h);
-	}
-
-	// for all possible half floats, find the nearest qlog7-12 value (one table per bit width)
-	for (uint32_t bits = HALF_TO_QLOG_TABS_BASE; bits <= 12; bits++)
-	{
-		compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE], qlog16_to_float);
-	}
-}
-
-// [ise_range][0] = # levels
-// [ise_range][1...] = lerp value [0,64]
-// in ASTC order
-// Table rows cover ISE weight ranges 0 to 10 (11 total); row 0 is a placeholder, supported ranges are 1 to 10.
-const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = 1; // ISE 1=3 levels
-const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = 10; // ISE 10=24 levels
-
-static const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][32] =
-{
-	{ 0 }, // ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block)
-	{ 3, 0, 32, 64 }, // 1
-	{ 4, 0, 21, 43, 64 }, // 2
-	{ 5, 0, 16, 32, 48, 64 }, // 3
-	{ 6, 0, 64, 12, 52, 25, 39 }, // 4
-	{ 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // 5
-	{ 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // 6
-	{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7
-	{ 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // 8
-	{ 20, 0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35 }, // 9
-	{ 24, 0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34 } // 10
-};
-
-//{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7
-//static const uint8_t g_weight_order_7[12] = { 0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1 };
-
-static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels) // Returns the average of the first three components of num_pixels pixels; the fourth component is ignored.
-{
-	vec3F mean(0.0f);
-
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		const vec4F& p = pPixels[i];
-
-		mean[0] += p[0];
-		mean[1] += p[1];
-		mean[2] += p[2];
-	}
-
-	return mean / static_cast<float>(num_pixels); // NOTE(review): divides by zero when num_pixels == 0; callers presumably never pass 0 - confirm.
-}
-
-static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color) // Estimates the principal RGB axis of the pixels about mean_color using three power iterations on the covariance matrix; returns a unit vector.
-{
-	float cov[6] = { 0, 0, 0, 0, 0, 0 }; // Upper triangle of the symmetric 3x3 covariance: rr, rg, rb, gg, gb, bb (not divided by num_pixels).
-
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		const vec4F& v = pPixels[i];
-
-		float r = v[0] - mean_color[0];
-		float g = v[1] - mean_color[1];
-		float b = v[2] - mean_color[2];
-
-		cov[0] += r * r;
-		cov[1] += r * g;
-		cov[2] += r * b;
-		cov[3] += g * g;
-		cov[4] += g * b;
-		cov[5] += b * b;
-	}
-
-	float xr = .9f, xg = 1.0f, xb = .7f; // Fixed starting vector for the power iteration.
-	for (uint32_t iter = 0; iter < 3; iter++)
-	{
-		float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
-		float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
-		float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
-
-		float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
-
-		if (m > 1e-10f) // Rescale by the largest magnitude each iteration; skipped when the product is ~0.
-		{
-			m = 1.0f / m;
-
-			r *= m;
-			g *= m;
-			b *= m;
-		}
-
-		xr = r;
-		xg = g;
-		xb = b;
-	}
-
-	float len = xr * xr + xg * xg + xb * xb;
-
-	vec3F axis;
-	if (len < 1e-10f)
-		axis.set(0.0f); // Degenerate result; replaced by the fallback axis below.
-	else
-	{
-		len = 1.0f / sqrtf(len);
-
-		xr *= len;
-		xg *= len;
-		xb *= len;
-
-		axis.set(xr, xg, xb, 0);
-	}
-
-	if (axis.dot(axis) < .5f) // Not a unit vector (degenerate case): fall back to the normalized (1,1,1) axis.
-	{
-		axis.set(1.0f, 1.0f, 1.0f, 0.0f);
-		axis.normalize_in_place();
-	}
-
-	return axis;
-}
-
-static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr) // Returns mean + dir * df if it lies inside colorspace_box (*pInside = true); otherwise returns a point found by intersecting a ray back toward mean with input_box (*pInside = false).
-{
-#if 0
-	assert(mean[0] >= input_box[0][0]);
-	assert(mean[1] >= input_box[0][1]);
-	assert(mean[2] >= input_box[0][2]);
-	assert(mean[0] <= input_box[1][0]);
-	assert(mean[1] <= input_box[1][1]);
-	assert(mean[2] <= input_box[1][2]);
-#endif
-
-	if (pInside)
-		*pInside = false;
-
-	vec3F k(mean + dir * df);
-	if (colorspace_box.contains(k))
-	{
-		if (pInside)
-			*pInside = true;
-
-		return k;
-	}
-
-	// starts inside
-	vec3F s(mean);
-
-	// ends outside
-	vec3F e(mean + dir * df);
-
-	// a ray guaranteed to go from the outside to inside
-	ray3F r(e, (s - e).normalize_in_place());
-	vec3F c;
-	float t = 0.0f;
-
-	intersection::result res = intersection::ray_aabb(c, t, r, input_box); // Note: containment was tested against colorspace_box, but the ray is intersected with input_box.
-	if (res != intersection::cSuccess)
-		c = k; // No intersection: return the unclipped point.
-
-	return c;
-}
-
-// all in Q16 space, 0-65535
-static bool compute_least_squares_endpoints_rgb( // Solves for low/high RGB endpoints (*pXl, *pXh) that best fit pColors given per-pixel selectors; returns false (outputs untouched) when the 2x2 system is singular.
-	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
-	vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box)
-{
-	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
-	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
-	// I did this in matrix form first, expanded out all the ops, then optimized it a bit.
-	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
-	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
-	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
-	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
-
-	for (uint32_t i = 0; i < N; i++)
-	{
-		const uint32_t sel = pSelectors[i];
-		z00 += pSelector_weights[sel][0]; // Components 0..2 of each pSelector_weights entry feed the 2x2 normal matrix.
-		z10 += pSelector_weights[sel][1];
-		z11 += pSelector_weights[sel][2];
-
-		float w = pSelector_weights[sel][3]; // Component 3 is the interpolation weight itself.
-		q00_r += w * pColors[i][0];
-		t_r += pColors[i][0];
-
-		q00_g += w * pColors[i][1];
-		t_g += pColors[i][1];
-
-		q00_b += w * pColors[i][2];
-		t_b += pColors[i][2];
-	}
-
-	q10_r = t_r - q00_r; // sum((1 - w) * c) obtained as sum(c) - sum(w * c).
-	q10_g = t_g - q00_g;
-	q10_b = t_b - q00_b;
-
-	z01 = z10; // The normal matrix is symmetric.
-
-	float det = z00 * z11 - z01 * z10;
-	if (det == 0.0f)
-		return false; // Singular system: no unique solution.
-
-	det = 1.0f / det;
-
-	float iz00, iz01, iz10, iz11;
-	iz00 = z11 * det;
-	iz01 = -z01 * det;
-	iz10 = -z10 * det;
-	iz11 = z00 * det;
-
-	(*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
-	(*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
-
-	(*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
-	(*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
-
-	(*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b);
-	(*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b);
-
-	for (uint32_t c = 0; c < 3; c++)
-	{
-		float l = (*pXl)[c], h = (*pXh)[c];
-
-		if (input_box.get_dim(c) < .0000125f) // Channel is (nearly) constant in the input: use the input bounds instead of the solved values.
-		{
-			l = input_box[0][c];
-			h = input_box[1][c];
-		}
-
-		(*pXl)[c] = l;
-		(*pXh)[c] = h;
-	}
-
-	vec3F mean((*pXl + *pXh) * .5f);
-	vec3F dir(*pXh - *pXl);
-
-	float ln = dir.length();
-	if (ln) // Skipped when both endpoints coincide.
-	{
-		dir /= ln;
-
-		float ld = (*pXl - mean).dot(dir);
-		float hd = (*pXh - mean).dot(dir);
-
-		aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL));
-
-		bool was_inside1 = false;
-
-		vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1);
-		if (!was_inside1) // Endpoint left the valid colorspace: replace it with the clipped point.
-			*pXl = l;
-
-		bool was_inside2 = false;
-		vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2);
-		if (!was_inside2)
-			*pXh = h;
-	}
-
-	pXl->clamp(0.0f, MAX_QLOG16_VAL); // Final safety clamp of both endpoints.
-	pXh->clamp(0.0f, MAX_QLOG16_VAL);
-
-	return true;
-}
-
-static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> (w*w, (1-w)*w, (1-w)*(1-w), w); filled by encode_astc_hdr_init().
-
-static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> linear index
-static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][linear_index] -> astc_index
-
-static void encode_astc_hdr_init() // Fills g_astc_ls_weights_ise and the two ASTC<->linear weight order maps for ISE ranges 1..10.
-{
-	// Precomputed weight constants used during least squares fitting. For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
-	for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++)
-	{
-		const uint32_t num_levels = g_ise_weight_lerps[range][0];
-		assert((num_levels >= 3) && (num_levels <= 24));
-
-		for (uint32_t i = 0; i < num_levels; i++)
-		{
-			float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f); // Normalize the [0,64] lerp value to [0,1].
-
-			g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w);
-		}
-	}
-
-	for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++)
-	{
-		const uint32_t num_levels = g_ise_weight_lerps[ise_range][0];
-		assert((num_levels >= 3) && (num_levels <= 24));
-
-		uint32_t s[32];
-		for (uint32_t i = 0; i < num_levels; i++)
-			s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i; // Sort key: lerp value in the high bits, ASTC index in the low 8 bits.
-
-		std::sort(s, s + num_levels); // Ascending by lerp value gives the linear order.
-
-		for (uint32_t i = 0; i < num_levels; i++)
-			g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF);
-
-		for (uint32_t i = 0; i < num_levels; i++)
-			g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i; // Inverse permutation.
-	}
-}
-
-void interpolate_qlog12_colors( // Interpolates two qlog12 RGB endpoints at the first n weight levels of ise_weight_range (ASTC order) and writes the results as half and/or float.
-	const int e[2][3],
-	half_float* pDecoded_half,
-	vec3F* pDecoded_float,
-	uint32_t n, uint32_t ise_weight_range)
-{
-	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-
-	for (uint32_t i = 0; i < 2; i++)
-	{
-		for (uint32_t j = 0; j < 3; j++)
-		{
-			assert(in_range(e[i][j], 0, 0xFFF)); // Endpoints must be valid 12-bit values.
-		}
-	}
-
-	for (uint32_t i = 0; i < n; i++)
-	{
-		const int c = g_ise_weight_lerps[ise_weight_range][1 + i]; // Lerp weight in [0,64] for level i.
-		assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range));
-
-		half_float rf, gf, bf;
-
-		{
-			uint32_t r0 = e[0][0] << 4; // Widen qlog12 to qlog16.
-			uint32_t r1 = e[1][0] << 4;
-			int ri = (r0 * (64 - c) + r1 * c + 32) / 64; // Weighted blend with round-to-nearest.
-			rf = qlog16_to_half_slow(ri);
-		}
-
-		{
-			uint32_t g0 = e[0][1] << 4;
-			uint32_t g1 = e[1][1] << 4;
-			int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
-			gf = qlog16_to_half_slow(gi);
-		}
-
-		{
-			uint32_t b0 = e[0][2] << 4;
-			uint32_t b1 = e[1][2] << 4;
-			int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
-			bf = qlog16_to_half_slow(bi);
-		}
-
-		if (pDecoded_half) // Optional output: three half values (R, G, B) per level.
-		{
-			pDecoded_half[i * 3 + 0] = rf;
-			pDecoded_half[i * 3 + 1] = gf;
-			pDecoded_half[i * 3 + 2] = bf;
-		}
-
-		if (pDecoded_float) // Optional output: one vec3F per level.
-		{
-			pDecoded_float[i][0] = half_to_float(rf);
-			pDecoded_float[i][1] = half_to_float(gf);
-			pDecoded_float[i][2] = half_to_float(bf);
-		}
-	}
-}
-
-// decoded in ASTC order, not linear order
-// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded
-bool get_astc_hdr_mode_11_block_colors( // Decodes mode 11 endpoints to qlog12 and interpolates the n block colors; the outputs are not written on failure.
-	const uint8_t* pEndpoints,
-	half_float* pDecoded_half,
-	vec3F* pDecoded_float,
-	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-
-	int e[2][3];
-	if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range))
-		return false;
-
-	interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
-
-	return true;
-}
-
-// decoded in ASTC order, not linear order
-// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded
-bool get_astc_hdr_mode_7_block_colors( // Decodes mode 7 endpoints to qlog12 and interpolates the n block colors; the outputs are not written on failure.
-	const uint8_t* pEndpoints,
-	half_float* pDecoded_half,
-	vec3F* pDecoded_float,
-	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-
-	int e[2][3];
-	if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range)) // The third argument (an optional output) is not needed here.
-		return false;
-
-	interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
-
-	return true;
-}
-
-// Fast high precision piecewise linear approximation of log2(bias+x).
-// Half may be zero, positive or denormal. No NaN/Inf/negative.
-static inline double q(half_float x)
-{
-	union { float f; int32_t i; uint32_t u; } fi; // Used to reinterpret the float's bits as an integer.
-
-	fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
-
-	assert(fi.f >= 0.0f);
-
-	fi.f += .125f; // The bias: keeps the argument away from zero.
-
-	return (double)fi.u; // approx log2f(fi.f), need to return double for the precision
-}
-
-double eval_selectors( // For each pixel, picks the allowed weight level whose decoded color has the lowest weighted log-space error, stores it in pWeights, and returns the summed error.
-	uint32_t num_pixels,
-	uint8_t* pWeights,
-	const half_float* pBlock_pixels_half,
-	uint32_t num_weight_levels,
-	const half_float* pDecoded_half,
-	const astc_hdr_codec_options& coptions,
-	uint32_t usable_selector_bitmask)
-{
-	assert((num_pixels >= 1) && (num_pixels <= 16));
-	assert(usable_selector_bitmask);
-
-	const float R_WEIGHT = coptions.m_r_err_scale;
-	const float G_WEIGHT = coptions.m_g_err_scale;
-
-	double total_error = 0;
-
-#ifdef _DEBUG
-	for (uint32_t i = 0; i < num_weight_levels; i++)
-	{
-		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
-		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
-		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
-	}
-#endif
-
-	for (uint32_t p = 0; p < num_pixels; p++)
-	{
-		const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; // Pixels are packed as three halfs (R, G, B) each.
-
-		double lowest_e = 1e+30f;
-
-		// this is an approximation of MSLE
-		for (uint32_t i = 0; i < num_weight_levels; i++)
-		{
-			if (((1 << i) & usable_selector_bitmask) == 0) // Bit i of the mask enables weight level i.
-				continue;
-
-			// compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE
-			double rd = q(pDecoded_half[i * 3 + 0]) - q(pDesired_half[0]);
-			double gd = q(pDecoded_half[i * 3 + 1]) - q(pDesired_half[1]);
-			double bd = q(pDecoded_half[i * 3 + 2]) - q(pDesired_half[2]);
-
-			double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; // Blue has an implicit weight of 1.
-
-			if (e < lowest_e) // Strict '<': on ties the lowest level index wins.
-			{
-				lowest_e = e;
-				pWeights[p] = (uint8_t)i;
-			}
-		}
-
-		total_error += lowest_e;
-
-	} // p
-
-	return total_error;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-double compute_block_error(const half_float* pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_options& coptions) // Sums the weighted log-space squared error between two 16-pixel RGB half blocks, using the same metric as eval_selectors().
-{
-	const float R_WEIGHT = coptions.m_r_err_scale;
-	const float G_WEIGHT = coptions.m_g_err_scale;
-
-	double total_error = 0;
-		
-	for (uint32_t p = 0; p < 16; p++) // Pixel count is fixed at 16; each pixel is three halfs (R, G, B).
-	{
-		double rd = q(pOrig_block[p * 3 + 0]) - q(pPacked_block[p * 3 + 0]);
-		double gd = q(pOrig_block[p * 3 + 1]) - q(pPacked_block[p * 3 + 1]);
-		double bd = q(pOrig_block[p * 3 + 2]) - q(pPacked_block[p * 3 + 2]);
-
-		double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; // Blue has an implicit weight of 1.
-
-		total_error += e;
-	}
-
-	return total_error;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag) // Clamps v to [l, h]; on a clamp sets did_clamp and raises max_clamp_mag to the clamped distance if larger. Neither output is reset when no clamp occurs.
-{
-	assert(l < h);
-
-	if (v < l)
-	{
-		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, l - v); // Running maximum across calls.
-
-		v = l;
-		did_clamp = true;
-	}
-	else if (v > h)
-	{
-		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, v - h);
-
-		v = h;
-		did_clamp = true;
-	}
-
-	return v;
-}
-
-static bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag)
-{
-	assert(submode <= 7);
-
-	const uint8_t s_b_bits[8] = { 7, 8, 6, 7,  8, 6, 7, 6 };
-	const uint8_t s_c_bits[8] = { 6, 6, 7, 7,  6, 7, 7, 7 };
-	const uint8_t s_d_bits[8] = { 7, 6, 7, 6,  5, 6, 5, 6 };
-
-	const uint32_t a_bits = 9 + (submode >> 1);
-	const uint32_t b_bits = s_b_bits[submode];
-	const uint32_t c_bits = s_c_bits[submode];
-	const uint32_t d_bits = s_d_bits[submode];
-
-	const int max_a_val = (1 << a_bits) - 1;
-	const int max_b_val = (1 << b_bits) - 1;
-	const int max_c_val = (1 << c_bits) - 1;
-
-	// The maximum usable value before it turns to NaN/Inf
-	const int max_a_qlog = get_max_qlog(a_bits);
-
-	const int min_d_val = -(1 << (d_bits - 1));
-	const int max_d_val = -min_d_val - 1;
-	assert((max_d_val - min_d_val + 1) == (1 << d_bits));
-
-	int val_q[2][3];
-
-	for (uint32_t c = 0; c < 3; c++)
-	{
-#if 1
-		// this is better
-		const half_float l = qlog16_to_half_slow((uint32_t)std::round(low_q16[c]));
-		val_q[0][c] = half_to_qlog7_12(l, a_bits);
-		
-		const half_float h = qlog16_to_half_slow((uint32_t)std::round(high_q16[c]));
-		val_q[1][c] = half_to_qlog7_12(h, a_bits);
-#else
-		val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits);
-		val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits);
-#endif
-				
-#if 1
-		if (val_q[0][c] == val_q[1][c])
-		{
-#if 0
-			if (l <= h)
-#else
-			if (low_q16[c] < high_q16[c])
-#endif
-			{
-				if (val_q[0][c])
-					val_q[0][c]--;
-
-				if (val_q[1][c] != max_a_val)
-					val_q[1][c]++;
-			}
-			else
-			{
-				if (val_q[0][c] != max_a_val)
-					val_q[0][c]++;
-
-				if (val_q[1][c])
-					val_q[1][c]--;
-			}
-		}
-#endif
-
-		val_q[0][c] = minimum<uint32_t>(val_q[0][c], max_a_qlog);
-		val_q[1][c] = minimum<uint32_t>(val_q[1][c], max_a_qlog);
-	}
-
-	int highest_q = -1, highest_val = 0, highest_comp = 0;
-
-	for (uint32_t v = 0; v < 2; v++)
-	{
-		for (uint32_t c = 0; c < 3; c++)
-		{
-			assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val);
-
-			if (val_q[v][c] > highest_q)
-			{
-				highest_q = val_q[v][c];
-				highest_val = v;
-				highest_comp = c;
-			}
-		}
-	}
-
-	const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q);
-
-	if (highest_val != 1)
-	{
-		for (uint32_t c = 0; c < 3; c++)
-		{
-			std::swap(val_q[0][c], val_q[1][c]);
-		}
-	}
-
-	if (highest_comp)
-	{
-		std::swap(val_q[0][0], val_q[0][highest_comp]);
-		std::swap(val_q[1][0], val_q[1][highest_comp]);
-	}
-
-	int orig_q[2][3];
-	memcpy(orig_q, val_q, sizeof(val_q));
-
-	// val[1][0] is now guaranteed to be highest
-	int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0;
-	int best_max_clamp_mag = 0;
-	bool best_did_clamp = false;
-	int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 }  };
-	BASISU_NOTE_UNUSED(best_q);
-	uint32_t best_dist = UINT_MAX;
-
-	for (uint32_t pass = 0; pass < 2; pass++)
-	{
-		int trial_va = val_q[1][0];
-
-		assert(trial_va <= max_a_val);
-		assert(trial_va >= val_q[1][1]);
-		assert(trial_va >= val_q[1][2]);
-
-		assert(trial_va >= val_q[0][0]);
-		assert(trial_va >= val_q[0][1]);
-		assert(trial_va >= val_q[0][2]);
-
-		bool did_clamp = false;
-		int trial_max_clamp_mag = 0;
-
-		int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag);
-		int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag);
-		int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag);
-		int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
-		int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
-
-		if (!did_clamp)
-		{
-			// Make sure decoder gets the expected values
-			assert(trial_va == val_q[1][0]);
-			assert(trial_va - trial_vb0 == val_q[1][1]);
-			assert(trial_va - trial_vb1 == val_q[1][2]);
-
-			assert((trial_va - trial_vc) == val_q[0][0]);
-			assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]);
-			assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]);
-		}
-
-		const int r_e0 = clamp<int>(trial_va, 0, max_a_val);
-		const int r_e1 = clamp<int>(trial_va - trial_vb0, 0, max_a_val);
-		const int r_e2 = clamp<int>(trial_va - trial_vb1, 0, max_a_val);
-
-		const int r_f0 = clamp<int>(trial_va - trial_vc, 0, max_a_val);
-		const int r_f1 = clamp<int>(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val);
-		const int r_f2 = clamp<int>(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val);
-
-		assert(r_e0 <= max_a_qlog);
-		assert(r_e1 <= max_a_qlog);
-		assert(r_e2 <= max_a_qlog);
-
-		assert(r_f0 <= max_a_qlog);
-		assert(r_f1 <= max_a_qlog);
-		assert(r_f2 <= max_a_qlog);
-
-		if ((!did_clamp) || (!had_tie))
-		{
-			best_va = trial_va;
-			best_vb0 = trial_vb0;
-			best_vb1 = trial_vb1;
-			best_vc = trial_vc;
-			best_vd0 = trial_vd0;
-			best_vd1 = trial_vd1;
-			best_max_clamp_mag = trial_max_clamp_mag;
-			best_did_clamp = did_clamp;
-
-			best_q[1][0] = r_e0;
-			best_q[1][1] = r_e1;
-			best_q[1][2] = r_e2;
-			best_q[0][0] = r_f0;
-			best_q[0][1] = r_f1;
-			best_q[0][2] = r_f2;
-			break;
-		}
-
-		// we had a tie and it did clamp, try swapping L/H for a potential slight gain
-
-		const uint32_t r_dist1 = basisu::square<int>(r_e0 - val_q[1][0]) + basisu::square<int>(r_e1 - val_q[1][1]) + basisu::square<int>(r_e2 - val_q[1][2]);
-		const uint32_t r_dist0 = basisu::square<int>(r_f0 - val_q[0][0]) + basisu::square<int>(r_f1 - val_q[0][1]) + basisu::square<int>(r_f2 - val_q[0][2]);
-
-		const uint32_t total_dist = r_dist1 + r_dist0;
-
-		if (total_dist < best_dist)
-		{
-			best_dist = total_dist;
-
-			best_va = trial_va;
-			best_vb0 = trial_vb0;
-			best_vb1 = trial_vb1;
-			best_vc = trial_vc;
-			best_vd0 = trial_vd0;
-			best_vd1 = trial_vd1;
-			best_did_clamp = did_clamp;
-
-			best_q[1][0] = r_e0;
-			best_q[1][1] = r_e1;
-			best_q[1][2] = r_e2;
-			best_q[0][0] = r_f0;
-			best_q[0][1] = r_f1;
-			best_q[0][2] = r_f2;
-		}
-
-		for (uint32_t c = 0; c < 3; c++)
-			std::swap(val_q[0][c], val_q[1][c]);
-	}
-
-	// pack bits now
-	int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0;
-
-	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0;
-	switch (submode)
-	{
-	case 0:
-		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	case 1:
-		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	case 2:
-		x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	case 3:
-		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	case 4:
-		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
-		break;
-	case 5:
-		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	case 6:
-		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
-		break;
-	case 7:
-		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
-		break;
-	default:
-		break;
-	}
-
-	// write mode
-	pack_bit(v1, 7, submode, 0);
-	pack_bit(v2, 7, submode, 1);
-	pack_bit(v3, 7, submode, 2);
-
-	// highest component
-	pack_bit(v4, 7, highest_comp, 0);
-	pack_bit(v5, 7, highest_comp, 1);
-
-	// write bit 8 of va
-	pack_bit(v1, 6, best_va, 8);
-
-	// extra bits
-	pack_bit(v2, 6, x0);
-	pack_bit(v3, 6, x1);
-	pack_bit(v4, 6, x2);
-	pack_bit(v5, 6, x3);
-	pack_bit(v4, 5, x4);
-	pack_bit(v5, 5, x5);
-
-	v0 = best_va & 0xFF;
-	v1 |= (best_vc & 63);
-	v2 |= (best_vb0 & 63);
-	v3 |= (best_vb1 & 63);
-	v4 |= (best_vd0 & 31);
-	v5 |= (best_vd1 & 31);
-
-	assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255));
-
-	pEndpoints[0] = (uint8_t)v0;
-	pEndpoints[1] = (uint8_t)v1;
-	pEndpoints[2] = (uint8_t)v2;
-	pEndpoints[3] = (uint8_t)v3;
-	pEndpoints[4] = (uint8_t)v4;
-	pEndpoints[5] = (uint8_t)v5;
-
-#ifdef _DEBUG
-	// Test for valid pack by unpacking
-	{
-		if (highest_comp)
-		{
-			std::swap(best_q[0][0], best_q[0][highest_comp]);
-			std::swap(best_q[1][0], best_q[1][highest_comp]);
-
-			std::swap(orig_q[0][0], orig_q[0][highest_comp]);
-			std::swap(orig_q[1][0], orig_q[1][highest_comp]);
-		}
-
-		int test_e[2][3];
-		decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS);
-		for (uint32_t i = 0; i < 2; i++)
-		{
-			for (uint32_t j = 0; j < 3; j++)
-			{
-				assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits));
-
-				if (!best_did_clamp)
-				{
-					assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) ||
-						(orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits)));
-				}
-			}
-		}
-	}
-#endif
-
-	max_clamp_mag = best_max_clamp_mag;
-
-	return best_did_clamp;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Packs a low/high endpoint color pair into six mode 11 "direct" endpoint bytes.
- *
- * pEndpoints: receives 6 bytes, laid out as [2*c + 0] = low value, [2*c + 1] = high value for channel c.
- * l_q16, h_q16: low and high colors; each channel is rounded and clamped to [0, 65535] before conversion.
- *
- * Channels 0 and 1 are stored with the g_half_to_qlog8 table (capped at MAX_QLOG8), channel 2 with the
- * g_half_to_qlog7 table (capped at MAX_QLOG7) and with bit 7 (value 128) set in both of its bytes.
- * This function reports no clamping; it has no return value.
- */
-static void pack_astc_mode11_direct(uint8_t* pEndpoints, const vec3F& l_q16, const vec3F& h_q16)
-{
-	for (uint32_t i = 0; i < 3; i++)
-	{
-		// TODO: This goes from QLOG16->HALF->QLOG8/7
-		half_float l_half = qlog16_to_half_slow(clamp((int)std::round(l_q16[i]), 0, 65535));
-		half_float h_half = qlog16_to_half_slow(clamp((int)std::round(h_q16[i]), 0, 65535));
-
-		int l_q, h_q;
-
-		if (i == 2)
-		{
-			l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)];
-			h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)];
-
-			l_q = minimum<uint32_t>(l_q, MAX_QLOG7);
-			h_q = minimum<uint32_t>(h_q, MAX_QLOG7);
-		}
-		else
-		{
-			l_q = g_half_to_qlog8[bounds_check((uint32_t)l_half, 0U, 32768U)];
-			h_q = g_half_to_qlog8[bounds_check((uint32_t)h_half, 0U, 32768U)];
-
-			l_q = minimum<uint32_t>(l_q, MAX_QLOG8);
-			h_q = minimum<uint32_t>(h_q, MAX_QLOG8);
-		}
-
-#if 1
-		if (l_q == h_q) // Both ends quantized to the same value: move them one step apart where the range allows, keeping the order of the unquantized inputs.
-		{
-			const int m = (i == 2) ? MAX_QLOG7 : MAX_QLOG8;
-
-			if (l_q16[i] <= h_q16[i])
-			{
-				if (l_q)
-					l_q--;
-
-				if (h_q != m)
-					h_q++;
-			}
-			else
-			{
-				if (h_q)
-					h_q--;
-
-				if (l_q != m)
-					l_q++;
-			}
-		}
-#endif
-				
-		if (i == 2)
-		{
-			assert(l_q <= (int)MAX_QLOG7 && h_q <= (int)MAX_QLOG7);
-			l_q |= 128; // NOTE(review): presumably the top bits of bytes 4 and 5 are what mark the encoding as "direct" for the decoder - confirm against the ASTC spec.
-			h_q |= 128;
-		}
-		else
-		{
-			assert(l_q <= (int)MAX_QLOG8 && h_q <= (int)MAX_QLOG8);
-		}
-
-		pEndpoints[2 * i + 0] = (uint8_t)l_q;
-		pEndpoints[2 * i + 1] = (uint8_t)h_q;
-	}
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Packs an RGB color plus a scale value S into the four endpoint bytes of one mode 7 submode.
- *
- * submode:          0..5 (asserted); selects the per-component bit widths from the s_*_bits tables below.
- * pEndpoints:       receives 4 bytes.
- * rgb_q16, s_q16:   color and scale; each is rounded, clamped to [0, MAX_QLOG16] and quantized to
- *                   s_r_bits[submode] bits with quant_qlog16().
- * max_clamp_mag:    out; reset to 0, then raised to the largest amount by which a G/B difference or S
- *                   exceeded its bit field.
- * ise_weight_range: weight ISE range (asserted 1..MAX_SUPPORTED_ISE_WEIGHT_INDEX); when >= 4 the
- *                   quantized values are biased upwards.
- *
- * Returns true if any value had to be clamped to fit its field. Clamping is only attempted for
- * submodes 0-4; submode 5 always returns false.
- */
-static bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range)
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-
-	assert(submode <= 5);
-	max_clamp_mag = 0;
-
-	static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 }; // bits for component 0 (the largest component); also used as the quantization precision
-	static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 }; // bits for components 1 and 2
-	static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 }; // bits for the scale S
-
-	// The precision of the components
-	const uint32_t prec_bits = s_r_bits[submode];
-
-	int qlog[4], pack_bits[4]; // index 0..2 = R, G, B; index 3 = S
-
-	for (uint32_t i = 0; i < 4; i++)
-	{
-		const float f = (i == 3) ? s_q16 : rgb_q16[i];
-
-		// The # of bits the component is packed into
-		if (i == 0)
-			pack_bits[i] = s_r_bits[submode];
-		else if (i == 3)
-			pack_bits[i] = s_s_bits[submode];
-		else
-			pack_bits[i] = s_g_b_bits[submode];
-
-#if 0
-		// this is slightly worse
-		// TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error.
-		half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16);
-		qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits);
-#else
-		qlog[i] = quant_qlog16(clamp<int>((int)std::round(f), 0, MAX_QLOG16), prec_bits);
-
-		// Only bias if there are enough texel weights, 4=6 weights
-		if (ise_weight_range >= 4)
-		{
-			// Explicitly bias the high color, and the scale up, to better exploit the weights.
-			// The quantized range also then encompasses the complete input range.
-			const uint32_t max_val = (1 << prec_bits) - 1;
-			const uint32_t K = 3;
-			if (i == 3)
-			{
-				qlog[i] = minimum<uint32_t>(qlog[i] + K * 2, max_val);
-			}
-			else
-			{
-				qlog[i] = minimum<uint32_t>(qlog[i] + K, max_val);
-			}
-		}
-#endif
-
-		if (i != 3)
-			qlog[i] = minimum<uint32_t>(qlog[i], get_max_qlog(prec_bits));
-
-		// If S=0, we lose freedom for the texel weights to add any value.
-		if ((i == 3) && (qlog[i] == 0))
-			qlog[i] = 1;
-	}
-
-	uint32_t maj_index = 0; // index of the largest color component; stays 0 for submode 5
-
-	bool did_clamp = false;
-
-	if (submode != 5)
-	{
-		int largest_qlog = 0;
-		for (uint32_t i = 0; i < 3; i++)
-		{
-			if (qlog[i] > largest_qlog)
-			{
-				largest_qlog = qlog[i];
-				maj_index = i;
-			}
-		}
-
-		if (maj_index)
-		{
-			std::swap(qlog[0], qlog[maj_index]); // move the largest component into slot 0
-		}
-
-		assert(qlog[0] >= qlog[1]);
-		assert(qlog[0] >= qlog[2]);
-
-		qlog[1] = qlog[0] - qlog[1]; // the two smaller components are stored as non-negative differences from the largest one
-		qlog[2] = qlog[0] - qlog[2];
-
-		for (uint32_t i = 1; i < 4; i++)
-		{
-			const int max_val = (1 << pack_bits[i]) - 1;
-
-			if (qlog[i] > max_val)
-			{
-				max_clamp_mag = maximum<int>(max_clamp_mag, qlog[i] - max_val);
-				qlog[i] = max_val;
-				did_clamp = true;
-			}
-		}
-	}
-
-	for (uint32_t i = 0; i < 4; i++)
-	{
-		const int max_val = (1 << pack_bits[i]) - 1; (void)max_val;
-
-		assert(qlog[i] <= max_val);
-	}
-
-	int mode = 0;
-
-	int r = qlog[0] & 63; // 6-bits
-	int g = qlog[1] & 31; // 5-bits
-	int b = qlog[2] & 31; // 5-bits
-	int s = qlog[3] & 31; // 5-bits
-
-	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0; // the seven remaining bit slots; which value bit lands in each slot depends on the submode
-
-	switch (submode)
-	{
-	case 0:
-	{
-		mode = (maj_index << 2) | 0;
-		assert((mode & 0xC) != 0xC);
-
-		x0 = get_bit(qlog[0], 9); // R9
-		x1 = get_bit(qlog[0], 8); // R8
-		x2 = get_bit(qlog[0], 7); // R7
-		x3 = get_bit(qlog[0], 10); // R10
-		x4 = get_bit(qlog[0], 6); // R6 
-		x5 = get_bit(qlog[3], 6); // S6
-		x6 = get_bit(qlog[3], 5); // S5
-		break;
-	}
-	case 1:
-	{
-		mode = (maj_index << 2) | 1;
-		assert((mode & 0xC) != 0xC);
-
-		x0 = get_bit(qlog[0], 8); // R8
-		x1 = get_bit(qlog[1], 5); // G5
-		x2 = get_bit(qlog[0], 7); // R7
-		x3 = get_bit(qlog[2], 5); // B5
-		x4 = get_bit(qlog[0], 6); // R6 
-		x5 = get_bit(qlog[0], 10); // R10
-		x6 = get_bit(qlog[0], 9); // R9
-		break;
-	}
-	case 2:
-	{
-		mode = (maj_index << 2) | 2;
-		assert((mode & 0xC) != 0xC);
-
-		x0 = get_bit(qlog[0], 9); // R9
-		x1 = get_bit(qlog[0], 8); // R8
-		x2 = get_bit(qlog[0], 7); // R7
-		x3 = get_bit(qlog[0], 6); // R6
-		x4 = get_bit(qlog[3], 7); // S7 
-		x5 = get_bit(qlog[3], 6); // S6
-		x6 = get_bit(qlog[3], 5); // S5
-		break;
-	}
-	case 3:
-	{
-		mode = (maj_index << 2) | 3;
-		assert((mode & 0xC) != 0xC);
-
-		x0 = get_bit(qlog[0], 8); // R8
-		x1 = get_bit(qlog[1], 5); // G5
-		x2 = get_bit(qlog[0], 7); // R7
-		x3 = get_bit(qlog[2], 5); // B5
-		x4 = get_bit(qlog[0], 6); // R6 
-		x5 = get_bit(qlog[3], 6); // S6
-		x6 = get_bit(qlog[3], 5); // S5
-		break;
-	}
-	case 4:
-	{
-		mode = maj_index | 0xC; // 0b1100
-		assert((mode & 0xC) == 0xC);
-		assert(mode != 0xF);
-
-		x0 = get_bit(qlog[1], 6); // G6
-		x1 = get_bit(qlog[1], 5); // G5
-		x2 = get_bit(qlog[2], 6); // B6
-		x3 = get_bit(qlog[2], 5); // B5
-		x4 = get_bit(qlog[0], 6); // R6 
-		x5 = get_bit(qlog[0], 7); // R7
-		x6 = get_bit(qlog[3], 5); // S5
-		break;
-	}
-	case 5:
-	{
-		mode = 0xF;
-
-		x0 = get_bit(qlog[1], 6); // G6
-		x1 = get_bit(qlog[1], 5); // G5
-		x2 = get_bit(qlog[2], 6); // B6
-		x3 = get_bit(qlog[2], 5); // B5
-		x4 = get_bit(qlog[0], 6); // R6 
-		x5 = get_bit(qlog[3], 6); // S6
-		x6 = get_bit(qlog[3], 5); // S5
-		break;
-	}
-	default:
-	{
-		assert(0);
-		break;
-	}
-	}
-
-	pEndpoints[0] = (uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r); // the 4 mode bits are spread over the top bits of bytes 0..2
-	pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g);
-	pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b);
-	pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s);
-
-#ifdef _DEBUG
-	// Test for valid pack by unpacking
-	{
-		const int inv_shift = 12 - prec_bits;
-
-		int unpacked_e[2][3];
-		if (submode != 5)
-		{
-			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
-			unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF);
-			unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF);
-
-			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
-			unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF);
-			unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF);
-		}
-		else
-		{
-			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
-			unpacked_e[1][1] = left_shift32(qlog[1], inv_shift);
-			unpacked_e[1][2] = left_shift32(qlog[2], inv_shift);
-
-			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
-			unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF);
-			unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF);
-		}
-
-		if (maj_index)
-		{
-			std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]); // undo the largest-component swap done before packing
-			std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]);
-		}
-
-		int e[2][3];
-		decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr);
-
-		for (uint32_t i = 0; i < 3; i++)
-		{
-			assert(unpacked_e[0][i] == e[0][i]);
-			assert(unpacked_e[1][i] == e[1][i]);
-		}
-	}
-#endif
-
-	return did_clamp;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Converts n packed 8-bit endpoint values to the given ISE endpoint range.
- *
- * ise_endpoint_range: target range; asserted to lie within FIRST/LAST_VALID_ENDPOINT_ISE_RANGE.
- * pSrc_endpoints:     n source bytes (values 0..255).
- * pDst_endpoints:     receives n bytes.
- *
- * At BISE_256_LEVELS the bytes are copied unchanged. For any other range each byte is replaced by
- * the m_val_to_ise entry of that range's endpoint table.
- */
-static void quantize_ise_endpoints(uint32_t ise_endpoint_range, const uint8_t* pSrc_endpoints, uint8_t *pDst_endpoints, uint32_t n)
-{
-	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
-
-	if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
-	{
-		memcpy(pDst_endpoints, pSrc_endpoints, n);
-	}
-	else
-	{
-		for (uint32_t i = 0; i < n; i++)
-		{
-			uint32_t v = pSrc_endpoints[i];
-			assert(v <= 255);
-
-			pDst_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_val_to_ise[v];
-		}
-	}
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Tries to encode a block with mode 11 using the given low/high endpoint colors.
- *
- * Submodes are tried from last_submode down to first_submode; submode -1 means the direct packing
- * (pack_astc_mode11_direct). When direct_only is set, only -1 is tried.
- *
- * pEndpoints, pWeights, cur_block_error, submode_used are in/out: they are overwritten only when a
- * trial's error is lower than cur_block_error. submode_used is stored as (submode + 1), so 0 means
- * direct and 1..8 mean submodes 0..7.
- * constrain_ise_weight8_selectors: when set and ise_weight_range is BISE_16_LEVELS, only the 8
- * selector values listed in the bitmask below may be used.
- */
-// Note this could fail to find any valid solution if use_endpoint_range!=20.
-// Returns true if improved.
-static bool try_mode11(uint32_t num_pixels,
-	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
-	vec3F& low_color_q16, const vec3F& high_color_q16,
-	half_float block_pixels_half[16][3],
-	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, bool direct_only, uint32_t ise_endpoint_range, 
-	bool constrain_ise_weight8_selectors, 
-	int32_t first_submode, int32_t last_submode) // -1, 7
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-	assert((num_weight_levels >= 3) && (num_weight_levels <= 32));
-	assert((num_pixels >= 1) && (num_pixels <= 16));
-
-	bool improved_flag = false;
-
-	half_float decoded_half[32][3]; // one decoded color per weight level (num_weight_levels <= 32 is asserted above)
-	vec3F decoded_float[32];
-	uint8_t orig_trial_endpoints[NUM_MODE11_ENDPOINTS], trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16];
-
-	if (direct_only)
-	{
-		first_submode = -1;
-		last_submode = -1;
-	}
-
-	assert(first_submode <= last_submode);
-	assert((first_submode >= -1) && (first_submode <= 7));
-	assert((last_submode >= -1) && (last_submode <= 7));
-
-	// TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
-	for (int submode = last_submode; submode >= first_submode; submode--)
-	{
-		bool did_clamp = false;
-		int max_clamp_mag = 0;
-		if (submode == -1)
-		{
-			// If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision.
-			pack_astc_mode11_direct(orig_trial_endpoints, low_color_q16, high_color_q16);
-		}
-		else
-		{
-			did_clamp = pack_astc_mode11_submode(submode, orig_trial_endpoints, low_color_q16, high_color_q16, max_clamp_mag);
-
-			// If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts.
-			const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
-			if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
-				continue;
-		}
-				
-		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
-		// It could massively distort the endpoints, but still result in a valid encoding.
-		quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
-		
-		if (!get_astc_hdr_mode_11_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
-			continue; // the quantized endpoints could not be turned into block colors; skip this submode
-
-		uint32_t usable_selector_bitmask = UINT32_MAX; // by default every selector value is allowed
-		if ((constrain_ise_weight8_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
-			usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
-
-		double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
-		if (trial_blk_error < cur_block_error)
-		{
-			cur_block_error = trial_blk_error;
-			memcpy(pEndpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
-			memcpy(pWeights, trial_weights, num_pixels);
-			submode_used = submode + 1; // shifted by one so that direct (-1) is reported as 0
-			improved_flag = true;
-		}
-
-		// If it didn't clamp it was a lossless encode at this precision, so we can stop early as there's probably no use trying lower precision submodes.
-		// (Although it may be, because a lower precision pack could try nearby voxel coords.)
-		// However, at lower levels quantization may cause the decoded endpoints to be very distorted, so we need to evaluate up to direct.
-		if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) 
-		{
-			if (!did_clamp)
-				break;
-		}
-	}
-
-	return improved_flag;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Tries to encode a block with mode 7 using the given high color and scale value.
- *
- * Submodes 0..5 are tried in ascending order. pEndpoints, pWeights, cur_block_error and
- * submode_used are in/out: they are overwritten only when a trial's error is lower than
- * cur_block_error. Returns true if at least one trial improved on cur_block_error.
- *
- * NOTE(review): the local decode buffers hold 24 weight levels; this function does not itself
- * check num_weight_levels, so it relies on the caller to keep it <= 24 - confirm at call sites.
- */
-static bool try_mode7(
-	uint32_t num_pixels,
-	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
-	vec3F& high_color_q16, const float s_q16,
-	half_float block_pixels_half[16][3],
-	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, 
-	uint32_t ise_endpoint_range)
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-	assert((num_pixels >= 1) && (num_pixels <= 16));
-
-	bool improved_flag = false;
-
-	half_float decoded_half[24][3];
-	vec3F decoded_float[24];
-
-	uint8_t orig_trial_endpoints[NUM_MODE7_ENDPOINTS], trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16];
-
-	// TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
-	for (int submode = 0; submode <= 5; submode++)
-	{
-		int max_clamp_mag = 0;
-		const bool did_clamp = pack_astc_mode7_submode(submode, orig_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range);
-
-		if (submode < 5) // submode 5 is never rejected here, so it is always evaluated if the loop reaches it
-		{
-			const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
-			if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
-				continue;
-		}
-
-		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
-		// It could massively distort the endpoints, but still result in a valid encoding.
-		quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
-
-		if (!get_astc_hdr_mode_7_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
-			continue;
-
-		double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions);
-		if (trial_blk_error < cur_block_error)
-		{
-			cur_block_error = trial_blk_error;
-			memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
-			memcpy(pWeights, trial_weights, num_pixels);
-			submode_used = submode;
-			improved_flag = true;
-		}
-
-		if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) // with full-precision endpoints, stop at the first submode that packed without clamping
-		{
-			if (!did_clamp)
-				break;
-		}
-	}
-
-	return improved_flag;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Searches for a mode 11 encoding of a block that beats cur_block_error.
- *
- * Steps: (1) convert the pixels to half and to QLOG16, (2) pick initial low/high colors from the
- * pixels with the smallest/largest projection onto the block's principal axis, pulled 1/64 towards
- * each other, (3) run try_mode11 on them, (4) refine with up to 3 least-squares passes, and
- * (5) if uber_mode is set, try three small perturbations of the best weights.
- *
- * best_submode:               out; reset to 0, then set by whichever trial wins.
- * cur_block_error:            error to beat (passed by value).
- * blk_endpoints, blk_weights: out; written only when a trial's error is lower than the current error.
- * direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, first_submode, last_submode:
- *                             forwarded unchanged to try_mode11.
- *
- * Returns the best error found, or the cur_block_error passed in if nothing improved on it.
- */
-static double encode_astc_hdr_block_mode_11(
-	uint32_t num_pixels,
-	const vec4F* pBlock_pixels,
-	uint32_t ise_weight_range,
-	uint32_t& best_submode,
-	double cur_block_error,
-	uint8_t* blk_endpoints, uint8_t* blk_weights,
-	const astc_hdr_codec_options& coptions,
-	bool direct_only,
-	uint32_t ise_endpoint_range,
-	bool uber_mode,
-	bool constrain_ise_weight8_selectors,
-	int32_t first_submode, int32_t last_submode)
-{
-	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
-	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
-	assert((num_pixels >= 1) && (num_pixels <= 16));
-
-	best_submode = 0;
-
-	half_float block_pixels_half[16][3];
-	vec4F block_pixels_q16[16];
-		
-	// TODO: This is done redundantly.
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
-		block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
-
-		block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
-		block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
-
-		block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
-		block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
-
-		block_pixels_q16[i][3] = 0.0f; // the 4th component is not used; zeroed so the vec4F is fully initialized
-	}
-
-	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
-	
-	// TODO: should match MAX_SUPPORTED_ISE_WEIGHT_INDEX
-	const uint32_t MAX_WEIGHT_LEVELS = 32;
-	(void)MAX_WEIGHT_LEVELS;
-	assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
-
-	vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
-	vec3F block_axis_q16(calc_rgb_pca(num_pixels, block_pixels_q16, block_mean_color_q16));
-
-	aabb3F color_box_q16(cInitExpand);
-
-	float l = 1e+30f, h = -1e+30f; // smallest / largest projection onto the axis seen so far
-	vec3F low_color_q16, high_color_q16;
-
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		color_box_q16.expand(block_pixels_q16[i]);
-
-		vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
-		float kd = k.dot(block_axis_q16);
-
-		if (kd < l)
-		{
-			l = kd;
-			low_color_q16 = block_pixels_q16[i];
-		}
-
-		if (kd > h)
-		{
-			h = kd;
-			high_color_q16 = block_pixels_q16[i];
-		}
-	}
-
-	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
-	for (uint32_t i = 0; i < 3; i++)
-	{
-		low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f); // pull both endpoints 1/64 of the way towards each other
-		high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
-	}
-		
-	uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
-	uint8_t trial_blk_weights[16];
-	uint32_t trial_best_submode = 0;
-	
-	clear_obj(trial_blk_endpoints);
-	clear_obj(trial_blk_weights);
-	
-	double trial_blk_error = 1e+30f; // start from a huge error so the first valid trial is always recorded in the trial_* buffers
-
-	bool did_improve = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
-		low_color_q16, high_color_q16,
-		block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
-		first_submode, last_submode);
-	
-	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
-	if (!did_improve)
-		return cur_block_error;
-
-	// Did the solution improve?
-	if (trial_blk_error < cur_block_error)
-	{
-		cur_block_error = trial_blk_error;
-		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
-		memcpy(blk_weights, trial_blk_weights, num_pixels);
-		best_submode = trial_best_submode;
-	}
-		
-#define USE_LEAST_SQUARES (1)
-#if USE_LEAST_SQUARES
-	// least squares on the most promising trial weight indices found
-	const uint32_t NUM_LS_PASSES = 3;
-
-	for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
-	{
-		vec3F l_q16, h_q16;
-		if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
-			break;
-
-		bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-			l_q16, h_q16,
-			block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
-			first_submode, last_submode);
-
-		if (!was_improved)
-			break;
-
-		// It's improved, so let's take the new weight indices.
-		memcpy(trial_blk_weights, blk_weights, num_pixels);
-
-	} // pass
-#endif
-		
-	if (uber_mode)
-	{
-		// Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost.
-
-		uint8_t temp_astc_weights[16];
-		memcpy(temp_astc_weights, trial_blk_weights, num_pixels);
-
-		uint32_t min_lin_sel = 256, max_lin_sel = 0; // lowest / highest weight level used by the block, in linear order
-		for (uint32_t i = 0; i < num_pixels; i++)
-		{
-			const uint32_t astc_sel = temp_astc_weights[i];
-
-			const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
-			assert(lin_sel < num_weight_levels);
-
-			min_lin_sel = minimumu(min_lin_sel, lin_sel);
-			max_lin_sel = maximumu(max_lin_sel, lin_sel);
-		}
-
-		bool was_improved = false;
-		(void)was_improved;
-
-		{ // Variation 1: move every weight sitting at the lowest used level up by one step.
-			bool weights_changed = false;
-			uint8_t trial_weights[16];
-			for (uint32_t i = 0; i < num_pixels; i++)
-			{
-				uint32_t astc_sel = temp_astc_weights[i];
-				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
-
-				if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
-				{
-					lin_sel++;
-					weights_changed = true;
-				}
-
-				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
-			}
-
-			if (weights_changed)
-			{
-				vec3F l_q16, h_q16;
-				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
-				{
-					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-						l_q16, h_q16,
-						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, 
-						first_submode, last_submode))
-					{
-						was_improved = true;
-					}
-				}
-			}
-		}
-
-		{ // Variation 2: move every weight sitting at the highest used level down by one step.
-			bool weights_changed = false;
-			uint8_t trial_weights[16];
-			for (uint32_t i = 0; i < num_pixels; i++)
-			{
-				uint32_t astc_sel = temp_astc_weights[i];
-				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
-
-				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
-				{
-					lin_sel--;
-					weights_changed = true;
-				}
-
-				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
-			}
-
-			if (weights_changed)
-			{
-				vec3F l_q16, h_q16;
-				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
-				{
-					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-						l_q16, h_q16,
-						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
-						first_submode, last_submode))
-					{
-						was_improved = true;
-					}
-				}
-			}
-		}
-
-		{ // Variation 3: apply both moves at once (highest level down, lowest level up).
-			bool weights_changed = false;
-			uint8_t trial_weights[16];
-			for (uint32_t i = 0; i < num_pixels; i++)
-			{
-				uint32_t astc_sel = temp_astc_weights[i];
-				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
-
-				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
-				{
-					lin_sel--;
-					weights_changed = true;
-				}
-				else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
-				{
-					lin_sel++;
-					weights_changed = true;
-				}
-
-				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
-			}
-
-			if (weights_changed)
-			{
-				vec3F l_q16, h_q16;
-				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
-				{
-					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-						l_q16, h_q16,
-						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
-						first_submode, last_submode))
-					{
-						was_improved = true;
-					}
-				}
-			}
-		}
-	} // uber_mode
-
-	return cur_block_error;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-/* Searches for a mode 7 (color + scale) encoding of a block that beats cur_block_error.
- *
- * Steps: (1) convert the pixels to half and to QLOG16, (2) derive an initial high color and scale S
- * from the extremes of the pixels' projection onto a fixed axis, (3) run try_mode7 on them, and
- * (4) for up to 2 trials, alternately re-estimate the high color from the current S and weights,
- * then re-estimate S from the current high color and weights.
- *
- * best_submode:               out; reset to 0, then set by whichever trial wins.
- * cur_block_error:            error to beat (passed by value).
- * blk_endpoints, blk_weights: out; written only when a trial's error is lower than the current error.
- *
- * Returns the best error found, or the cur_block_error passed in if nothing improved on it.
- */
-static double encode_astc_hdr_block_mode_7(
-	uint32_t num_pixels, const vec4F* pBlock_pixels,
-	uint32_t ise_weight_range,
-	uint32_t& best_submode,
-	double cur_block_error,
-	uint8_t* blk_endpoints,  //[4]
-	uint8_t* blk_weights, // [num_pixels]
-	const astc_hdr_codec_options& coptions,
-	uint32_t ise_endpoint_range)
-{
-	assert((num_pixels >= 1) && (num_pixels <= 16));
-	assert((ise_weight_range >= 1) && (ise_weight_range <= 10));
-	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
-	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
-
-	const uint32_t MAX_WEIGHT_LEVELS = 24; // matches the size of the decode buffers in try_mode7
-	assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
-	BASISU_NOTE_UNUSED(MAX_WEIGHT_LEVELS);
-
-	best_submode = 0;
-
-	half_float block_pixels_half[16][3];
-
-	vec4F block_pixels_q16[16];
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
-		block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
-
-		block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
-		block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
-
-		block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
-		block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
-
-		block_pixels_q16[i][3] = 0.0f;
-	}
-
-	vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
-
-	vec3F block_axis_q16(0.577350259f); // NOTE(review): presumably sets all three components, i.e. a unit-length gray axis (1/sqrt(3) each) - confirm the vec3F constructor
-
-	aabb3F color_box_q16(cInitExpand);
-
-	float l = 1e+30f, h = -1e+30f; // smallest / largest projection onto the axis
-	for (uint32_t i = 0; i < num_pixels; i++)
-	{
-		color_box_q16.expand(block_pixels_q16[i]);
-
-		vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
-		float kd = k.dot(block_axis_q16);
-
-		l = basisu::minimum<float>(l, kd);
-		h = basisu::maximum<float>(h, kd);
-	}
-
-	vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16));
-	vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16));
-
-	low_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
-	high_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
-
-	vec3F diff(high_color_q16 - low_color_q16);
-	float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0]; // length of diff along the axis, scaled by one axis component to get the per-channel offset
-
-	uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS];
-	uint8_t trial_blk_weights[16];
-	uint32_t trial_best_submode = 0;
-
-	clear_obj(trial_blk_endpoints);
-	clear_obj(trial_blk_weights);
-
-	double trial_blk_error = 1e+30f; // start from a huge error so the first valid trial is always recorded in the trial_* buffers
-
-	bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
-		high_color_q16, ceilf(s_q16),
-		block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
-
-	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
-	if (!did_improve)
-	{
-		return cur_block_error;
-	}
-
-	// Did the solution improve?
-	if (trial_blk_error < cur_block_error)
-	{
-		cur_block_error = trial_blk_error;
-		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS);
-		memcpy(blk_weights, trial_blk_weights, num_pixels);
-		best_submode = trial_best_submode;
-	}
-
-	const float one_over_num_pixels = 1.0f / (float)num_pixels;
-
-	const uint32_t NUM_TRIALS = 2;
-	for (uint32_t trial = 0; trial < NUM_TRIALS; trial++)
-	{
-		// Given a set of selectors and S, try to compute a better high color
-		vec3F new_high_color_q16(block_mean_color_q16);
-
-		int e[2][3];
-		int cur_s = 0;
-		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range))
-			break;
-
-		cur_s <<= 4; // NOTE(review): presumably rescales the decoded 12-bit value to the 16-bit QLOG range used here - confirm
-
-		for (uint32_t i = 0; i < num_pixels; i++)
-		{
-			uint32_t astc_sel = trial_blk_weights[i];
-			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f); // NOTE(review): the +1 suggests entry 0 of each table row is not a weight (e.g. a count) - confirm
-
-			float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
-			new_high_color_q16[0] += k;
-			new_high_color_q16[1] += k;
-			new_high_color_q16[2] += k;
-		}
-
-		bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-			new_high_color_q16, (float)cur_s,
-			block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
-
-		if (improved)
-		{
-			memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
-			memcpy(trial_blk_weights, blk_weights, num_pixels);
-		}
-
-		// Given a set of selectors and a high color, try to compute a better S.
-		float t = 0.0f;
-
-		for (uint32_t i = 0; i < num_pixels; i++)
-		{
-			uint32_t astc_sel = trial_blk_weights[i];
-			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
-
-			t += (1.0f) - lerp;
-		}
-
-		t *= one_over_num_pixels; // t is now the block's average of (1 - weight)
-
-		//int e[2][3];
-		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range))
-			break;
-
-		vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4));
-
-		if (fabs(t) > .0000125f) // avoid dividing by a (near-)zero t below
-		{
-			float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
-			float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
-			float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
-
-			// TODO: gather statistics on these
-			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-				cur_h_q16, ceilf(s_r),
-				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
-			{
-				improved = true;
-			}
-
-			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-				cur_h_q16, ceilf(s_g),
-				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
-			{
-				improved = true;
-			}
-
-			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-				cur_h_q16, ceilf(s_b),
-				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
-			{
-				improved = true;
-			}
-
-			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
-				cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f),
-				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
-			{
-				improved = true;
-			}
-		}
-
-		if (!improved)
-			break;
-
-		memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS); // carry the new best solution into the next trial
-		memcpy(trial_blk_weights, blk_weights, num_pixels);
-
-	} // trial
-
-	return cur_block_error;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Appends a single-color result for the block to all_results and reports whether the block really is one color.
-//
-// pBlock_linear_colors: the block's 16 texels (only the first three components of each are read).
-// all_results: receives exactly one result with m_is_solid set.
-// coptions: supplies the R/G error scales used when the block is not truly solid.
-//
-// Returns true when every texel matches texel 0 in R, G and B. Otherwise the stored color is a biased
-// log-space average of the texels and m_best_block_error holds the resulting error.
-static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions)
-{
-	const uint32_t NUM_TEXELS = 16;
-	const float LOG_BIAS = .125f;
-
-	const vec4F& first_texel = pBlock_linear_colors[0];
-
-	// Per-channel running sums of log2(texel + bias); later overwritten with the final channel value.
-	float chan[3] = { 0.0f, 0.0f, 0.0f };
-	bool solid_block = true;
-
-	for (uint32_t i = 0; i < NUM_TEXELS; i++)
-	{
-		const vec4F& texel = pBlock_linear_colors[i];
-
-		for (uint32_t c = 0; c < 3; c++)
-		{
-			if (first_texel[c] != texel[c])
-				solid_block = false;
-
-			chan[c] += log2f(texel[c] + LOG_BIAS);
-		}
-	}
-
-	for (uint32_t c = 0; c < 3; c++)
-	{
-		if (solid_block)
-		{
-			// Truly solid: use the texel value itself.
-			chan[c] = first_texel[c];
-		}
-		else
-		{
-			// Undo the log/bias to get the averaged value, then clamp (for safety) to the largest half float.
-			const float averaged = maximum<float>(0.0f, powf(2.0f, chan[c] * (1.0f / 16.0f)) - LOG_BIAS);
-			chan[c] = minimum<float>(averaged, MAX_HALF_FLOAT);
-		}
-	}
-
-	const half_float hc[3] =
-	{
-		float_to_half_non_neg_no_nan_inf(chan[0]),
-		float_to_half_non_neg_no_nan_inf(chan[1]),
-		float_to_half_non_neg_no_nan_inf(chan[2])
-	};
-	const half_float ah = float_to_half_non_neg_no_nan_inf(1.0f);
-
-	astc_hdr_pack_results results;
-	results.clear();
-	results.m_is_solid = true;
-
-	// Write the 16 block bytes directly: 0xFC, seven 0xFF bytes, then R, G, B, A as little-endian half floats.
-	uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk;
-
-	packed_blk[0] = 0b11111100;
-	for (uint32_t i = 1; i < 8; i++)
-		packed_blk[i] = 255;
-
-	const half_float rgba_half[4] = { hc[0], hc[1], hc[2], ah };
-	for (uint32_t c = 0; c < 4; c++)
-	{
-		packed_blk[8 + c * 2] = (uint8_t)rgba_half[c];
-		packed_blk[9 + c * 2] = (uint8_t)(rgba_half[c] >> 8);
-	}
-
-	results.m_best_block_error = 0;
-
-	if (!solid_block)
-	{
-		const float R_WEIGHT = coptions.m_r_err_scale;
-		const float G_WEIGHT = coptions.m_g_err_scale;
-
-		// This MUST match how errors are computed in eval_selectors().
-		for (uint32_t i = 0; i < NUM_TEXELS; i++)
-		{
-			const vec4F& texel = pBlock_linear_colors[i];
-
-			const half_float dr = float_to_half_non_neg_no_nan_inf(texel[0]);
-			const half_float dg = float_to_half_non_neg_no_nan_inf(texel[1]);
-			const half_float db = float_to_half_non_neg_no_nan_inf(texel[2]);
-
-			const double rd = q(hc[0]) - q(dr);
-			const double gd = q(hc[1]) - q(dg);
-			const double bd = q(hc[2]) - q(db);
-
-			const double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
-
-			results.m_best_block_error += e;
-		}
-	}
-
-	bc6h_enc_block_solid_color(&results.m_bc6h_block, hc);
-
-	all_results.push_back(results);
-
-	return solid_block;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Tries single-subset color endpoint mode 11 over an inclusive span of weight ISE ranges and appends one
-// result to all_results for every range that produced a usable encoding.
-//
-// pBlock_linear_colors: the block's 16 texels.
-// all_results: receives the logical ASTC block plus its BC6H transcode for each successful range.
-// coptions: encoder options (direct-only flag, uber mode flags, allowed submode span).
-// first_weight_ise_range / last_weight_ise_range: inclusive span of weight ranges to try; none may exceed BISE_16_LEVELS.
-// constrain_ise_weight8_selectors: forwarded to encode_astc_hdr_block_mode_11() and recorded in m_constrained_weights.
-static void pack_mode11(
-	const vec4F* pBlock_linear_colors, 
-	basisu::vector<astc_hdr_pack_results>& all_results, 
-	const astc_hdr_codec_options& coptions, 
-	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight8_selectors)
-{
-	uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16];
-	uint32_t trial_submode11 = 0;
-
-	clear_obj(trial_endpoints);
-	clear_obj(trial_weights);
-		
-	for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
-	{
-		const bool direct_only = coptions.m_mode11_direct_only;
-		
-		// 16 weight levels are paired with 192 endpoint levels; every smaller weight range gets the full 256.
-		uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS;
-		if (weight_ise_range == astc_helpers::BISE_16_LEVELS)
-			endpoint_ise_range = astc_helpers::BISE_192_LEVELS;
-		else
-		{
-			assert(weight_ise_range < astc_helpers::BISE_16_LEVELS);
-		}
-				
-		// 1e+30f is both the starting "current error" and the sentinel meaning "no usable solution found".
-		double trial_error = encode_astc_hdr_block_mode_11(16, pBlock_linear_colors, weight_ise_range, trial_submode11, 1e+30f, trial_endpoints, trial_weights, coptions, direct_only, 
-			endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight8_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode);
-
-		if (trial_error < 1e+30f)
-		{
-			astc_hdr_pack_results results;
-			results.clear();
-
-			results.m_best_block_error = trial_error;
-
-			results.m_best_submodes[0] = trial_submode11;
-			results.m_constrained_weights = constrain_ise_weight8_selectors;
-						
-			results.m_best_blk.m_num_partitions = 1;
-			results.m_best_blk.m_color_endpoint_modes[0] = 11;
-			results.m_best_blk.m_weight_ise_range = weight_ise_range;
-			results.m_best_blk.m_endpoint_ise_range = endpoint_ise_range;
-			
-			memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
-			memcpy(results.m_best_blk.m_weights, trial_weights, 16);
-
-#ifdef _DEBUG
-			// Debug-only cross check: decode the logical block and confirm the encoder's reported error.
-			{
-				half_float block_pixels_half[16][3];
-
-				for (uint32_t i = 0; i < 16; i++)
-				{
-					block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]);
-					block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]);
-					block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]);
-				}
-				
-				half_float unpacked_astc_blk_rgba[4][4][4];
-				bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
-				assert(res);
-
-				half_float unpacked_astc_blk_rgb[4][4][3];
-				for (uint32_t y = 0; y < 4; y++)
-					for (uint32_t x = 0; x < 4; x++)
-						for (uint32_t c = 0; c < 3; c++)
-							unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
-
-				double cmp_err = compute_block_error(&block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions);
-				assert(results.m_best_block_error == cmp_err);
-			}
-#endif
-
-			// transcode to BC6H
-			assert(results.m_best_blk.m_color_endpoint_modes[0] == 11);
-			
-			// Get qlog12 endpoints. Zero-initialized so a failed decode in a release build (where the assert
-			// below compiles away) never feeds indeterminate values into qlog_to_half().
-			int e[2][3] = {};
-			bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range);
-			assert(success);
-			BASISU_NOTE_UNUSED(success);
-
-			// Transform endpoints to half float, laid out as [channel][low/high]
-			half_float h_e[3][2] =
-			{
-				{ qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
-				{ qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
-				{ qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
-			};
-
-			// Transcode to bc6h
-			success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block);
-			assert(success);
-
-			all_results.push_back(results);
-		}
-	}
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Tries single-subset color endpoint mode 7 at each weight ISE range in
-// [m_first_mode7_part1_weight_ise_range, m_last_mode7_part1_weight_ise_range] and appends every usable
-// encoding, together with its BC6H transcode, to all_results.
-static void pack_mode7_single_part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions)
-{
-	// Endpoints always use 256 levels in this path.
-	const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
-
-	uint8_t cand_endpoints[NUM_MODE7_ENDPOINTS];
-	uint8_t cand_weights[16];
-	uint32_t cand_submode = 0;
-
-	clear_obj(cand_endpoints);
-	clear_obj(cand_weights);
-
-	const uint32_t first_range = coptions.m_first_mode7_part1_weight_ise_range;
-	const uint32_t last_range = coptions.m_last_mode7_part1_weight_ise_range;
-
-	for (uint32_t weight_ise_range = first_range; weight_ise_range <= last_range; weight_ise_range++)
-	{
-		const double cand_error = encode_astc_hdr_block_mode_7(16, pBlock_linear_colors, weight_ise_range, cand_submode, 1e+30f, cand_endpoints, cand_weights, coptions, ise_endpoint_range);
-
-		// 1e+30f is the "nothing usable" sentinel; skip this weight range if the encoder didn't beat it.
-		if (!(cand_error < 1e+30f))
-			continue;
-
-		astc_hdr_pack_results results;
-		results.clear();
-
-		results.m_best_block_error = cand_error;
-		results.m_best_submodes[0] = cand_submode;
-
-		astc_helpers::log_astc_block& blk = results.m_best_blk;
-		blk.m_num_partitions = 1;
-		blk.m_color_endpoint_modes[0] = 7;
-		blk.m_weight_ise_range = weight_ise_range;
-		blk.m_endpoint_ise_range = ise_endpoint_range;
-
-		memcpy(blk.m_endpoints, cand_endpoints, NUM_MODE7_ENDPOINTS);
-		memcpy(blk.m_weights, cand_weights, 16);
-
-		// Everything below converts the ASTC endpoints for the BC6H transcode.
-		assert(blk.m_color_endpoint_modes[0] == 7);
-
-		// Recover the qlog12 endpoints; a block that can't be decoded is dropped.
-		int e[2][3];
-		const bool decoded = decode_mode7_to_qlog12(blk.m_endpoints, e, nullptr, blk.m_endpoint_ise_range);
-		if (!decoded)
-			continue;
-
-		// Half-float endpoints laid out as [channel][low/high].
-		half_float h_e[3][2];
-		for (uint32_t c = 0; c < 3; c++)
-		{
-			h_e[c][0] = qlog_to_half(e[0][c], 12);
-			h_e[c][1] = qlog_to_half(e[1][c], 12);
-		}
-
-		const bool status = transcode_bc6h_1subset(h_e, blk, results.m_bc6h_block);
-		assert(status);
-		(void)status;
-
-		all_results.push_back(results);
-	}
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Estimates which of the common ASTC/BC6H 2-subset partition patterns best fit the block.
-//
-// The texels are split into two clusters with four k-means style passes, then every common pattern is
-// scored by how many texels it assigns the same way as the clustering (in either polarity).
-//
-// pBlock_pixels: the block's 16 texels.
-// pBest_parts: receives num_best_parts pattern indices, best match first.
-// num_best_parts: number of entries to write; must not exceed basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2.
-//
-// Returns false (leaving pBest_parts untouched) if num_best_parts is out of range or if either cluster
-// ends up empty after a pass; true otherwise.
-static bool estimate_partition2(const vec4F* pBlock_pixels, int* pBest_parts, uint32_t num_best_parts)
-{
-	// Enforced at runtime too: with asserts compiled out, an oversized count would underflow the
-	// part_similarity index below and write past the caller's buffer.
-	if (num_best_parts > basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2)
-	{
-		assert(0);
-		return false;
-	}
-
-	vec3F training_vecs[16], mean(0.0f);
-
-	// Cluster on the half-float bit patterns (converted to float) rather than on the linear values.
-	for (uint32_t i = 0; i < 16; i++)
-	{
-		vec3F& v = training_vecs[i];
-
-		v[0] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
-		v[1] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
-		v[2] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
-
-		mean += v;
-	}
-	mean *= (1.0f / 16.0f);
-
-	// Seed the two centroids just either side of the mean.
-	vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) };
-
-	uint32_t cluster_pixels[2][16];
-	uint32_t num_cluster_pixels[2];
-	vec3F new_cluster_means[2];
-
-	for (uint32_t s = 0; s < 4; s++)
-	{
-		num_cluster_pixels[0] = 0;
-		num_cluster_pixels[1] = 0;
-
-		new_cluster_means[0].clear();
-		new_cluster_means[1].clear();
-
-		for (uint32_t i = 0; i < 16; i++)
-		{
-			float d0 = training_vecs[i].squared_distance(cluster_centroids[0]);
-			float d1 = training_vecs[i].squared_distance(cluster_centroids[1]);
-
-			// Ties go to cluster 1.
-			if (d0 < d1)
-			{
-				cluster_pixels[0][num_cluster_pixels[0]] = i;
-				new_cluster_means[0] += training_vecs[i];
-				num_cluster_pixels[0]++;
-			}
-			else
-			{
-				cluster_pixels[1][num_cluster_pixels[1]] = i;
-				new_cluster_means[1] += training_vecs[i];
-				num_cluster_pixels[1]++;
-			}
-		}
-
-		// No meaningful 2-way split exists if one cluster is empty.
-		if (!num_cluster_pixels[0] || !num_cluster_pixels[1])
-			return false;
-
-		cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0];
-		cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1];
-	}
-
-	// Every texel landed in exactly one cluster, so all 16 entries get written here.
-	int desired_parts[4][4]; // [y][x]
-	for (uint32_t p = 0; p < 2; p++)
-	{
-		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
-		{
-			const uint32_t pix_index = cluster_pixels[p][i];
-
-			desired_parts[pix_index >> 2][pix_index & 3] = p;
-		}
-	}
-
-	// Each entry packs (similarity << 8) | pattern index so a plain integer sort orders by similarity.
-	uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
-
-	for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++)
-	{
-		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
-
-		int total_sim_non_inv = 0;
-		int total_sim_inv = 0;
-
-		for (uint32_t y = 0; y < 4; y++)
-		{
-			for (uint32_t x = 0; x < 4; x++)
-			{
-				int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
-
-				if (part == desired_parts[y][x])
-					total_sim_non_inv++;
-
-				if ((part ^ 1) == desired_parts[y][x])
-					total_sim_inv++;
-			}
-		}
-
-		// Cluster labels are arbitrary, so take whichever polarity matches better.
-		int total_sim = maximum(total_sim_non_inv, total_sim_inv);
-
-		part_similarity[part_index] = (total_sim << 8) | part_index;
-
-	} // part_index;
-
-	std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
-
-	// The sort is ascending, so read from the back to emit the best matches first.
-	for (uint32_t i = 0; i < num_best_parts; i++)
-		pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF;
-
-	return true;
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Tries 2-subset encodings where both subsets use color endpoint mode 7, appending every usable result
-// (logical ASTC block plus its BC6H transcode) to all_results.
-//
-// pBlock_linear_colors: the block's 16 texels.
-// num_estimated_partitions / pEstimated_partitions: when the count is non-zero only these pattern indices
-//   are tried; when zero, every common pattern enabled in coptions.m_mode7_part2_part_masks is tried.
-// first_weight_ise_range / last_weight_ise_range: inclusive span of weight ranges; none may exceed BISE_8_LEVELS.
-static void pack_mode7_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions,
-	int num_estimated_partitions, const int *pEstimated_partitions,
-	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range)
-{
-	assert(coptions.m_mode7_part2_part_masks);
-
-	// Fields that stay fixed for every trial are set up once here.
-	astc_helpers::log_astc_block trial_blk;
-	clear_obj(trial_blk);
-	trial_blk.m_grid_width = 4;
-	trial_blk.m_grid_height = 4;
-
-	trial_blk.m_num_partitions = 2;
-	trial_blk.m_color_endpoint_modes[0] = 7;
-	trial_blk.m_color_endpoint_modes[1] = 7;
-
-	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
-		
-	if (num_estimated_partitions)
-	{
-		first_part_index = 0;
-		last_part_index = num_estimated_partitions;
-	}
-	
-	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
-	{
-		uint32_t part_index;
-		if (num_estimated_partitions)
-		{
-			// The iterator indexes the estimate list; the mask check below is not applied on this path.
-			part_index = pEstimated_partitions[part_index_iter];
-			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
-		}
-		else
-		{
-			part_index = part_index_iter;
-			if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0)
-				continue;
-		}
-								
-		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
-		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
-		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
-
-		vec4F part_pixels[2][16];
-		uint32_t pixel_part_index[4][4]; // [y][x]
-		uint32_t num_part_pixels[2] = { 0, 0 };
-
-		// Extract each subset's texels for this partition pattern
-		// (texels are appended in raster order, which the weight scatter further down relies on)
-		for (uint32_t y = 0; y < 4; y++)
-		{
-			for (uint32_t x = 0; x < 4; x++)
-			{
-				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
-				if (invert_flag)
-					part = 1 - part;
-
-				pixel_part_index[y][x] = part;
-				part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
-
-				num_part_pixels[part]++;
-			}
-		}
-
-		trial_blk.m_partition_id = astc_pattern;
-				
-		for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
-		{
-			assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS);
-
-			// Weight ranges of 5, 6 and 8 levels are paired with progressively fewer endpoint levels;
-			// anything else keeps the full 256.
-			uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
-			if (weight_ise_range == astc_helpers::BISE_5_LEVELS)
-				ise_endpoint_range = astc_helpers::BISE_192_LEVELS;
-			else if (weight_ise_range == astc_helpers::BISE_6_LEVELS)
-				ise_endpoint_range = astc_helpers::BISE_128_LEVELS;
-			else if (weight_ise_range == astc_helpers::BISE_8_LEVELS)
-				ise_endpoint_range = astc_helpers::BISE_80_LEVELS;
-
-			uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16];
-			uint32_t trial_submode7[2];
-
-			clear_obj(trial_endpoints);
-			clear_obj(trial_weights);
-			clear_obj(trial_submode7);
-
-			// Encode each subset independently and sum the errors. 1e+30f is the encoder's starting error,
-			// and a total at or above it is treated below as "no usable solution".
-			double total_trial_err = 0;
-			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
-			{
-				total_trial_err += encode_astc_hdr_block_mode_7(
-					num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
-					weight_ise_range, trial_submode7[pack_part_index], 1e+30f,
-					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range);
-
-			} // pack_part_index
-
-			if (total_trial_err < 1e+30f)
-			{
-				trial_blk.m_weight_ise_range = weight_ise_range;
-				trial_blk.m_endpoint_ise_range = ise_endpoint_range;
-
-				// Subset endpoints are stored back to back.
-				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
-					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS);
-
-				// Scatter the per-subset weights back to raster order, consuming each subset's list in the
-				// same order its texels were gathered above.
-				uint32_t src_pixel_index[2] = { 0, 0 };
-				for (uint32_t y = 0; y < 4; y++)
-				{
-					for (uint32_t x = 0; x < 4; x++)
-					{
-						uint32_t p = pixel_part_index[y][x];
-						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
-					}
-				}
-								
-				astc_hdr_pack_results results;
-				results.clear();
-
-				results.m_best_block_error = total_trial_err;
-				results.m_best_submodes[0] = trial_submode7[0];
-				results.m_best_submodes[1] = trial_submode7[1];
-				results.m_best_pat_index = part_index;
-
-				results.m_best_blk = trial_blk;
-
-				// The result is pushed even if the transcode reports failure in a build without asserts.
-				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
-				assert(status);
-				BASISU_NOTE_UNUSED(status);
-
-				all_results.push_back(results);
-			}
-
-		} // weight_ise_range
-
-	} // part_index
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Tries 2-subset encodings where both subsets use color endpoint mode 11, appending every usable result
-// (logical ASTC block plus its BC6H transcode) to all_results.
-//
-// pBlock_linear_colors: the block's 16 texels.
-// num_estimated_partitions / pEstimated_partitions: when the count is non-zero only these pattern indices
-//   are tried; when zero, every common pattern enabled in coptions.m_mode11_part2_part_masks is tried.
-// The weight range span comes from coptions.m_first/m_last_mode11_part2_weight_ise_range.
-static void pack_mode11_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions,
-	int num_estimated_partitions, const int* pEstimated_partitions)
-{
-	assert(coptions.m_mode11_part2_part_masks);
-
-	// Fields that stay fixed for every trial are set up once here.
-	astc_helpers::log_astc_block trial_blk;
-	clear_obj(trial_blk);
-	trial_blk.m_grid_width = 4;
-	trial_blk.m_grid_height = 4;
-
-	trial_blk.m_num_partitions = 2;
-	trial_blk.m_color_endpoint_modes[0] = 11;
-	trial_blk.m_color_endpoint_modes[1] = 11;
-			
-	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
-
-	if (num_estimated_partitions)
-	{
-		first_part_index = 0;
-		last_part_index = num_estimated_partitions;
-	}
-
-	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
-	{
-		uint32_t part_index;
-		if (num_estimated_partitions)
-		{
-			// The iterator indexes the estimate list; the mask check below is not applied on this path.
-			part_index = pEstimated_partitions[part_index_iter];
-			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
-		}
-		else
-		{
-			part_index = part_index_iter;
-			if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0)
-				continue;
-		}
-
-		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
-		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
-		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
-
-		vec4F part_pixels[2][16];
-		uint32_t pixel_part_index[4][4]; // [y][x]
-		uint32_t num_part_pixels[2] = { 0, 0 };
-
-		// Extract each subset's texels for this partition pattern
-		// (texels are appended in raster order, which the weight scatter further down relies on)
-		for (uint32_t y = 0; y < 4; y++)
-		{
-			for (uint32_t x = 0; x < 4; x++)
-			{
-				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
-				if (invert_flag)
-					part = 1 - part;
-
-				pixel_part_index[y][x] = part;
-				part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
-
-				num_part_pixels[part]++;
-			}
-		}
-				
-		trial_blk.m_partition_id = astc_pattern;
-						
-		for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++)
-		{
-			// Unlike the 1-subset path, direct-only is never requested here.
-			bool direct_only = false;
-			// 64 endpoint levels by default, dropping to 40 when the weights use 4 levels.
-			uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS;
-			if (weight_ise_range == astc_helpers::BISE_4_LEVELS)
-				ise_endpoint_range = astc_helpers::BISE_40_LEVELS;
-
-			uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16];
-			uint32_t trial_submode11[2];
-
-			clear_obj(trial_endpoints); 
-			clear_obj(trial_weights);
-			clear_obj(trial_submode11);
-
-			// Encode each subset independently and sum the errors. 1e+30f is the encoder's starting error,
-			// and a total at or above it is treated below as "no usable solution".
-			double total_trial_err = 0;
-			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
-			{
-				total_trial_err += encode_astc_hdr_block_mode_11(
-					num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
-					weight_ise_range, trial_submode11[pack_part_index], 1e+30f,
-					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions,
-					direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false,
-					coptions.m_first_mode11_submode, coptions.m_last_mode11_submode);
-
-			} // pack_part_index
-
-			if (total_trial_err < 1e+30f)
-			{
-				trial_blk.m_weight_ise_range = weight_ise_range;
-				trial_blk.m_endpoint_ise_range = ise_endpoint_range;
-
-				// Subset endpoints are stored back to back.
-				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
-					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS);
-
-				// Scatter the per-subset weights back to raster order, consuming each subset's list in the
-				// same order its texels were gathered above.
-				uint32_t src_pixel_index[2] = { 0, 0 };
-				for (uint32_t y = 0; y < 4; y++)
-				{
-					for (uint32_t x = 0; x < 4; x++)
-					{
-						uint32_t p = pixel_part_index[y][x];
-						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
-					}
-				}
-								
-				astc_hdr_pack_results results;
-				results.clear();
-
-				results.m_best_block_error = total_trial_err;
-				results.m_best_submodes[0] = trial_submode11[0];
-				results.m_best_submodes[1] = trial_submode11[1];
-				results.m_best_pat_index = part_index;
-
-				results.m_best_blk = trial_blk;
-
-				// The result is pushed even if the transcode reports failure in a build without asserts.
-				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
-				assert(status);
-				BASISU_NOTE_UNUSED(status);
-
-				all_results.push_back(results);
-			}
-
-		} // weight_ise_range
-
-	} // part_index
-}
-
-//--------------------------------------------------------------------------------------------------------------------------
-
-// Set once astc_hdr_enc_init() has finished; checked by the encoder entry points.
-bool g_astc_hdr_enc_initialized;
-
-// One-time setup of the tables the ASTC HDR encoder relies on. Calls after the first are no-ops.
-void astc_hdr_enc_init()
-{
-	if (!g_astc_hdr_enc_initialized)
-	{
-		astc_hdr_core_init();
-		astc_helpers::init_tables(true);
-		init_qlog_tables();
-		encode_astc_hdr_init();
-
-		// Only flag completion after every table has been built.
-		g_astc_hdr_enc_initialized = true;
-	}
-}
-
-// Encodes one 4x4 block of RGB float texels, filling all_results with every candidate encoding found.
-//
-// pRGBPixels: 16 texels as 48 floats (R, G, B interleaved). Each value must be finite, non-negative and
-//   no larger than MAX_HALF_FLOAT.
-// coptions: selects which modes, weight ranges and partition strategies are tried.
-// all_results: emptied, then filled with candidates. Non-solid modes are skipped when solid packing is
-//   enabled and the block is a single color.
-//
-// Returns false if astc_hdr_enc_init() hasn't been called or an input value is invalid; true otherwise.
-bool astc_hdr_enc_block(
-	const float* pRGBPixels, 
-	const astc_hdr_codec_options& coptions,
-	basisu::vector<astc_hdr_pack_results>& all_results)
-{
-	assert(g_astc_hdr_enc_initialized);
-	if (!g_astc_hdr_enc_initialized)
-	{
-		// astc_hdr_enc_init() MUST be called first.
-		assert(0);
-		return false;
-	}
-
-	all_results.resize(0);
-				
-	vec4F block_linear_colors[16];
-
-	// Sanity check the input block.
-	for (uint32_t i = 0; i < 16; i++)
-	{
-		for (uint32_t j = 0; j < 3; j++)
-		{
-			float v = pRGBPixels[i * 3 + j];
-
-			if (std::isinf(v) || std::isnan(v))
-			{
-				// Input pixels cannot be NaN or +-Inf.
-				assert(0);
-				return false;
-			}
-
-			if (v < 0.0f)
-			{
-				// Input pixels cannot be signed.
-				assert(0);
-				return false;
-			}
-
-			if (v > MAX_HALF_FLOAT)
-			{
-				// Too large for half float.
-				assert(0);
-				return false;
-			}
-			
-			block_linear_colors[i][j] = v;
-		}
-		
-		block_linear_colors[i][3] = 1.0f;
-	}
-
-	assert(coptions.m_use_solid || coptions.m_use_mode11 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2);
-					
-	bool is_solid = false;
-	if (coptions.m_use_solid)
-		is_solid = pack_solid(block_linear_colors, all_results, coptions);
-
-	if (!is_solid)
-	{
-		if (coptions.m_use_mode11)
-		{
-			const size_t cur_num_results = all_results.size();
-
-			pack_mode11(block_linear_colors, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false);
-
-			// When the span ends at 16 weight levels, also try that range with constrained selectors.
-			if (coptions.m_last_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
-			{
-				pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_16_LEVELS, true);
-			}
-
-			// If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then 
-			// fall back to weight ISE range 7 (which doesn't need any endpoint quantization).
-			// This is to guarantee we always get at least 1 non-solid result.
-			if (all_results.size() == cur_num_results)
-			{
-				if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
-				{
-					pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false);
-				}
-			}
-		}
-				
-		if (coptions.m_use_mode7_part1)
-		{
-			// Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution.
-			pack_mode7_single_part(block_linear_colors, all_results, coptions);
-		}
-				
-		bool have_est = false;
-		int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
-
-		// best_parts only has room for TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 entries, so never ask for (or later
-		// read back) more than that, even if the option is set higher.
-		const uint32_t max_est_parts = minimum<uint32_t>(coptions.m_max_estimated_partitions, basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
-
-		if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2))
-		{
-			if (coptions.m_use_estimated_partitions)
-				have_est = estimate_partition2(block_linear_colors, best_parts, max_est_parts);
-		}
-
-		// A count of 0 tells the 2-subset packers to ignore best_parts and scan their enabled patterns instead.
-		const int num_est_parts = have_est ? (int)max_est_parts : 0;
-
-		if (coptions.m_use_mode7_part2)
-		{
-			const size_t cur_num_results = all_results.size();
-
-			pack_mode7_2part(block_linear_colors, all_results, coptions, num_est_parts, best_parts, 
-				coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range);
-
-			// If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to 
-			// 4 levels which doesn't require endpoint quantization.
-			if (all_results.size() == cur_num_results)
-			{
-				if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS)
-				{
-					pack_mode7_2part(block_linear_colors, all_results, coptions, num_est_parts, best_parts, 
-						astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS);
-				}
-			}
-		}
-		
-		if (coptions.m_use_mode11_part2)
-		{
-			// This always requires endpoint quant, so it could fail to find any usable solutions.
-			pack_mode11_2part(block_linear_colors, all_results, coptions, num_est_parts, best_parts);
-		}
-	}
-
-	if (coptions.m_refine_weights)
-	{
-		// TODO: Move this above, do it once only.
-		basist::half_float rgb_pixels_half[16 * 3];
-		for (uint32_t i = 0; i < 16; i++)
-		{
-			rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 0]);
-			rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 1]);
-			rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 2]);
-		}
-
-		// Refine every candidate in place (solid results pass through untouched).
-		for (uint32_t i = 0; i < all_results.size(); i++)
-		{
-			bool status = astc_hdr_refine_weights(rgb_pixels_half, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag);
-			assert(status);
-			BASISU_NOTE_UNUSED(status);
-		}
-	}
-
-	return true;
-}
-
-// Writes the physical ASTC block for a pack result into dst_blk.
-// Solid results are copied verbatim from m_solid_blk; all others are packed from the logical block.
-// Returns false if the encoder hasn't been initialized or packing fails.
-bool astc_hdr_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_pack_results& results)
-{
-	assert(g_astc_hdr_enc_initialized);
-	if (!g_astc_hdr_enc_initialized)
-		return false;
-
-	if (!results.m_is_solid)
-	{
-		const bool packed_ok = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk);
-		if (packed_ok)
-			return true;
-
-		assert(0);
-		return false;
-	}
-
-	// The solid block was already assembled byte-for-byte when the result was created.
-	memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk));
-	return true;
-}
-
-// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error.
-//
-// pSource_block: the 16 source texels as 48 half floats (R, G, B interleaved).
-// cur_results: candidate to refine; its weights, BC6H block and m_best_block_error are rewritten if any weight changes.
-// coptions: supplies the R/G error scales.
-// bc6h_weight: blend factor; each channel error is astc_error * (1 - bc6h_weight) + bc6h_error * bc6h_weight.
-// pImproved_flag: optional; set to true when at least one weight was changed, false otherwise.
-//   Note it reports a change of weights, not that m_best_block_error went down.
-//
-// Always returns true. Solid results are left untouched.
-bool astc_hdr_refine_weights(const half_float *pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool *pImproved_flag)
-{
-	if (pImproved_flag)
-		*pImproved_flag = false;
-
-	if (cur_results.m_is_solid)
-		return true;
-
-	const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range);
-
-	assert((total_weights >= 3) && (total_weights <= 16));
-
-	// Lowest blended error seen so far for each texel, and the weight index that produced it.
-	double best_err[4][4];
-	uint8_t best_weight[4][4];
-	for (uint32_t y = 0; y < 4; y++)
-	{
-		for (uint32_t x = 0; x < 4; x++)
-		{
-			best_err[y][x] = 1e+30f;
-			best_weight[y][x] = 0;
-		}
-	}
-
-	astc_hdr_pack_results temp_results;
-
-	const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f };
-
-	// For each possible weight index, build a block where every texel uses that index, decode it through
-	// both the ASTC and BC6H paths, and record per texel whether this index beats the best so far.
-	for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++)
-	{
-		temp_results = cur_results;
-		for (uint32_t i = 0; i < 16; i++)
-			temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index;
-		
-		half_float unpacked_astc_blk_rgba[4][4][4];
-		bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
-		assert(res);
-
-		basist::bc6h_block trial_bc6h_blk;
-		res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk);
-		assert(res);
-				
-		half_float unpacked_bc6h_blk[4][4][3];
-		res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false);
-		assert(res);
-		BASISU_NOTE_UNUSED(res);
-
-		for (uint32_t y = 0; y < 4; y++)
-		{
-			for (uint32_t x = 0; x < 4; x++)
-			{
-				double total_err = 0.0f;
-
-				for (uint32_t c = 0; c < 3; c++)
-				{
-					const half_float orig_c = pSource_block[(x + y * 4) * 3 + c];
-					const double orig_c_q = q(orig_c);
-					
-					const half_float astc_c = unpacked_astc_blk_rgba[y][x][c];
-					const double astc_c_q = q(astc_c);
-					const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c];
-					
-					const half_float bc6h_c = unpacked_bc6h_blk[y][x][c];
-					const double bc6h_c_q = q(bc6h_c);
-					const double bc6h_e = square(bc6h_c_q - orig_c_q) * c_weights[c];
-
-					const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight;
-
-					total_err += overall_err;
-
-				} //  c
-
-				// Strict '<' keeps the lowest weight index on ties.
-				if (total_err < best_err[y][x])
-				{
-					best_err[y][x] = total_err;
-					best_weight[y][x] = (uint8_t)weight_index;
-				}
-
-			} // x
-		} // y
-
-	} // weight_index
-
-	// Only touch cur_results if the per-texel winners differ from the weights it already holds.
-	bool any_changed = false;
-	for (uint32_t i = 0; i < 16; i++)
-	{
-		if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3])
-		{
-			any_changed = true;
-			break;
-		}
-	}
-
-	if (any_changed)
-	{
-		// best_weight is [y][x] with 4 entries per row, so it copies straight into the raster-order weights.
-		memcpy(cur_results.m_best_blk.m_weights, best_weight, 16);
-
-		{
-			// Regenerate the BC6H block from the new weights, then recompute the stored error from the decoded ASTC block.
-			bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block);
-			assert(res);
-			BASISU_NOTE_UNUSED(res);
-
-			half_float unpacked_astc_blk_rgba[4][4][4];
-			res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
-			assert(res);
-
-			// Drop alpha so the layout matches the RGB-only source block.
-			half_float unpacked_astc_blk_rgb[4][4][3];
-			for (uint32_t y = 0; y < 4; y++)
-				for (uint32_t x = 0; x < 4; x++)
-					for (uint32_t c = 0; c < 3; c++)
-						unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
-
-			cur_results.m_best_block_error = compute_block_error(pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions);
-		}
-
-		if (pImproved_flag)
-			*pImproved_flag = true;
-	}
-
-	return true;
-}
-
-void astc_hdr_block_stats::update(const astc_hdr_pack_results& log_blk)
-{
-	std::lock_guard<std::mutex> lck(m_mutex);
-
-	m_total_blocks++;
-
-	if (log_blk.m_improved_via_refinement_flag)
-		m_total_refined++;
-
-	if (log_blk.m_is_solid)
-	{
-		m_total_solid++;
-	}
-	else
-	{
-		int best_weight_range = log_blk.m_best_blk.m_weight_ise_range;
-
-		if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7)
-		{
-			m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++;
-
-			if (log_blk.m_best_blk.m_num_partitions == 2)
-			{
-				m_total_mode7_2part++;
-
-				m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++;
-				m_total_2part++;
-
-				m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++;
-
-				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
-			}
-			else
-			{
-				m_total_mode7_1part++;
-
-				m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++;
-			}
-		}
-		else
-		{
-			m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++;
-			if (log_blk.m_constrained_weights)
-				m_total_mode11_1part_constrained_weights++;
-
-			if (log_blk.m_best_blk.m_num_partitions == 2)
-			{
-				m_total_mode11_2part++;
-
-				m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++;
-				m_total_2part++;
-
-				m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++;
-
-				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
-			}
-			else
-			{
-				m_total_mode11_1part++;
-
-				m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++;
-			}
-		}
-	}
-}
-
-void astc_hdr_block_stats::print()
-{
-	std::lock_guard<std::mutex> lck(m_mutex);
-
-	assert(m_total_blocks);
-	if (!m_total_blocks)
-		return;
-
-	printf("\nLow-level ASTC Encoder Statistics:\n");
-	printf("Total blocks: %u\n", m_total_blocks);
-	printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks);
-	printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks);
-
-	printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks);
-	printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks);
-	printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks);
-
-	printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks);
-	printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks);
-
-	printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks);
-	printf("\n");
-
-	printf("ISE texel weight range histogram mode 11:\n");
-	for (uint32_t i = 1; i <= MODE11_LAST_ISE_RANGE; i++)
-		printf("%u %u\n", i, m_weight_range_hist_11[i]);
-	printf("\n");
-
-	printf("ISE texel weight range histogram mode 11, 2 partition:\n");
-	for (uint32_t i = 1; i <= MODE11_PART2_LAST_ISE_RANGE; i++)
-		printf("%u %u\n", i, m_weight_range_hist_11_2part[i]);
-	printf("\n");
-
-	printf("ISE texel weight range histogram mode 7:\n");
-	for (uint32_t i = 1; i <= MODE7_PART1_LAST_ISE_RANGE; i++)
-		printf("%u %u\n", i, m_weight_range_hist_7[i]);
-	printf("\n");
-
-	printf("ISE texel weight range histogram mode 7, 2 partition:\n");
-	for (uint32_t i = 1; i <= MODE7_PART2_LAST_ISE_RANGE; i++)
-		printf("%u %u\n", i, m_weight_range_hist_7_2part[i]);
-	printf("\n");
-
-	printf("Mode 11 submode histogram:\n");
-	for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding
-		printf("%u %u\n", i, m_mode11_submode_hist[i]);
-	printf("\n");
-
-	printf("Mode 7 submode histogram:\n");
-	for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++)
-		printf("%u %u\n", i, m_mode7_submode_hist[i]);
-	printf("\n");
-
-	printf("Partition pattern table usage histogram:\n");
-	for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++)
-		printf("%u:%u ", i, m_part_hist[i]);
-	printf("\n\n");
-}
-
-} // namespace basisu
-

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 396 - 161
thirdparty/basis_universal/encoder/basisu_comp.cpp


+ 209 - 64
thirdparty/basis_universal/encoder/basisu_comp.h

@@ -18,10 +18,11 @@
 #include "basisu_basis_file.h"
 #include "../transcoder/basisu_transcoder.h"
 #include "basisu_uastc_enc.h"
-#include "basisu_astc_hdr_enc.h"
+#include "basisu_uastc_hdr_4x4_enc.h"
+#include "basisu_astc_hdr_6x6_enc.h"
 
-#define BASISU_LIB_VERSION 150
-#define BASISU_LIB_VERSION_STRING "1.50"
+#define BASISU_LIB_VERSION 160
+#define BASISU_LIB_VERSION_STRING "1.60"
 
 #ifndef BASISD_SUPPORT_KTX2
 	#error BASISD_SUPPORT_KTX2 is undefined
@@ -76,6 +77,8 @@ namespace basisu
 			m_height = 0;
 						
 			m_basis_rgb_avg_psnr = 0.0f;
+			m_basis_rgb_avg_log2_psnr = 0.0f;
+
 			m_basis_rgba_avg_psnr = 0.0f;
 			m_basis_a_avg_psnr = 0.0f;
 			m_basis_luma_709_psnr = 0.0f;
@@ -83,6 +86,7 @@ namespace basisu
 			m_basis_luma_709_ssim = 0.0f;
 
 			m_basis_rgb_avg_bc6h_psnr = 0.0f;
+			m_basis_rgb_avg_bc6h_log2_psnr = 0.0f;
 
 			m_bc7_rgb_avg_psnr = 0.0f;
 			m_bc7_rgba_avg_psnr = 0.0f;
@@ -105,6 +109,8 @@ namespace basisu
 
 		// .basis/.ktx2 compressed (LDR: ETC1S or UASTC statistics, HDR: transcoded BC6H statistics)
 		float m_basis_rgb_avg_psnr;
+		float m_basis_rgb_avg_log2_psnr;
+
 		float m_basis_rgba_avg_psnr;
 		float m_basis_a_avg_psnr;
 		float m_basis_luma_709_psnr;
@@ -113,6 +119,7 @@ namespace basisu
 
 		// UASTC HDR only.
 		float m_basis_rgb_avg_bc6h_psnr;
+		float m_basis_rgb_avg_bc6h_log2_psnr;
 
 		// LDR: BC7 statistics
 		float m_bc7_rgb_avg_psnr;
@@ -131,6 +138,17 @@ namespace basisu
 		bool m_opencl_failed;
 	};
 
+	enum class hdr_modes
+	{
+		// standard but constrained ASTC HDR 4x4 tex data that can be rapidly transcoded to BC6H
+		cUASTC_HDR_4X4, 
+		// standard RDO optimized or non-RDO (highest quality) ASTC HDR 6x6 tex data that can be rapidly re-encoded to BC6H
+		cASTC_HDR_6X6,
+		// a custom intermediate format based off ASTC HDR that can be rapidly decoded straight to ASTC HDR or re-encoded to BC6H
+		cASTC_HDR_6X6_INTERMEDIATE,
+		cTotal
+	};
+
 	template<bool def>
 	struct bool_param
 	{
@@ -220,21 +238,23 @@ namespace basisu
 			m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f),
 			m_mip_scale(1.0f, .000125f, 4.0f),
 			m_mip_smallest_dimension(1, 1, 16384),
-			m_max_endpoint_clusters(512),
-			m_max_selector_clusters(512),
-			m_quality_level(-1),
-			m_pack_uastc_flags(cPackUASTCLevelDefault),
-			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f),
-			m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
-			m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
-			m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
-			m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
-			m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
+			m_etc1s_max_endpoint_clusters(512),
+			m_etc1s_max_selector_clusters(512),
+			m_etc1s_quality_level(-1),
+			m_pack_uastc_ldr_4x4_flags(cPackUASTCLevelDefault),
+			m_rdo_uastc_ldr_4x4_quality_scalar(1.0f, 0.001f, 50.0f),
+			m_rdo_uastc_ldr_4x4_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
+			m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
+			m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
+			m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
+			m_rdo_uastc_ldr_4x4_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
 			m_resample_width(0, 1, 16384),
 			m_resample_height(0, 1, 16384),
 			m_resample_factor(0.0f, .00125f, 100.0f),
 			m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE),
 			m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX),
+			m_ldr_hdr_upconversion_nit_multiplier(0.0f, 0.0f, basist::MAX_HALF_FLOAT),
+			m_ldr_hdr_upconversion_black_bias(0.0f, 0.0f, 1.0f),
 			m_pJob_pool(nullptr)
 		{
 			clear();
@@ -243,6 +263,9 @@ namespace basisu
 		void clear()
 		{
 			m_uastc.clear();
+			m_hdr.clear();
+			m_hdr_mode = hdr_modes::cUASTC_HDR_4X4;
+
 			m_use_opencl.clear();
 			m_status_output.clear();
 
@@ -290,24 +313,24 @@ namespace basisu
 			m_mip_fast.clear();
 			m_mip_smallest_dimension.clear();
 
-			m_max_endpoint_clusters = 0;
-			m_max_selector_clusters = 0;
-			m_quality_level = -1;
+			m_etc1s_max_endpoint_clusters = 0;
+			m_etc1s_max_selector_clusters = 0;
+			m_etc1s_quality_level = -1;
 
 			m_tex_type = basist::cBASISTexType2D;
 			m_userdata0 = 0;
 			m_userdata1 = 0;
 			m_us_per_frame = 0;
 
-			m_pack_uastc_flags = cPackUASTCLevelDefault;
-			m_rdo_uastc.clear();
-			m_rdo_uastc_quality_scalar.clear();
-			m_rdo_uastc_max_smooth_block_error_scale.clear();
-			m_rdo_uastc_smooth_block_max_std_dev.clear();
-			m_rdo_uastc_max_allowed_rms_increase_ratio.clear();
-			m_rdo_uastc_skip_block_rms_thresh.clear();
-			m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear();
-			m_rdo_uastc_multithreading.clear();
+			m_pack_uastc_ldr_4x4_flags = cPackUASTCLevelDefault;
+			m_rdo_uastc_ldr_4x4.clear();
+			m_rdo_uastc_ldr_4x4_quality_scalar.clear();
+			m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale.clear();
+			m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev.clear();
+			m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio.clear();
+			m_rdo_uastc_ldr_4x4_skip_block_rms_thresh.clear();
+			m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode.clear();
+			m_rdo_uastc_ldr_4x4_multithreading.clear();
 
 			m_resample_width.clear();
 			m_resample_height.clear();
@@ -323,19 +346,80 @@ namespace basisu
 
 			m_validate_output_data.clear();
 
-			m_hdr_ldr_srgb_to_linear_conversion.clear();
+			m_ldr_hdr_upconversion_srgb_to_linear.clear();
 
 			m_hdr_favor_astc.clear();
 			
+			m_uastc_hdr_4x4_options.init();
+			m_astc_hdr_6x6_options.clear();
+
+			m_ldr_hdr_upconversion_nit_multiplier.clear();
+			m_ldr_hdr_upconversion_black_bias.clear();
+			
 			m_pJob_pool = nullptr;
 		}
-						
+
+		// Configures the compressor's mode by setting the proper parameters (which were preserved for backwards compatibility with old code).
+		void set_format_mode(basist::basis_tex_format m)
+		{
+			switch (m)
+			{
+			case basist::basis_tex_format::cETC1S:
+			{
+				m_hdr = false;
+				m_uastc = false;
+				m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; // doesn't matter
+				break;
+			}
+			case basist::basis_tex_format::cUASTC4x4:
+			{
+				m_hdr = false;
+				m_uastc = true;
+				m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; // doesn't matter
+				break;
+			}
+			case basist::basis_tex_format::cUASTC_HDR_4x4:
+			{
+				m_hdr = true;
+				m_uastc = true;
+				m_hdr_mode = hdr_modes::cUASTC_HDR_4X4;
+				break;
+			}
+			case basist::basis_tex_format::cASTC_HDR_6x6:
+			{
+				m_hdr = true;
+				m_uastc = true;
+				m_hdr_mode = hdr_modes::cASTC_HDR_6X6;
+				break;
+			}
+			case basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE:
+			{
+				m_hdr = true;
+				m_uastc = true;
+				m_hdr_mode = hdr_modes::cASTC_HDR_6X6_INTERMEDIATE;
+				break;
+			}
+			default:
+				assert(0);
+				break;
+			}
+		}
+
+		// By default we generate LDR ETC1S data. 
+		// if m_uastc is true but m_hdr is not true, we generate UASTC 4x4 LDR data (8bpp with or without RDO).
+		// if m_uastc is true and m_hdr is true, we generate 4x4 or 6x6 HDR data (either standard ASTC, constrained ASTC, RDO ASTC, or intermediate), controlled by m_hdr_mode.
+		
 		// True to generate UASTC .basis/.KTX2 file data, otherwise ETC1S.
+		// Should be true for any non-ETC1S format (UASTC 4x4 LDR, UASTC 4x4 HDR, RDO ASTC 6x6 HDR, and ASTC 6x6 HDR intermediate).
 		bool_param<false> m_uastc;
 
-		// Set m_hdr to true to switch to UASTC HDR mode.
+		// Set m_hdr to true to switch to UASTC HDR mode.
+		// m_hdr_mode then controls which format is output (4x4, 6x6, or 6x6 intermediate).
 		bool_param<false> m_hdr;
 
+		// If m_hdr is true, this specifies which mode we operate in (currently UASTC 4x4 HDR or ASTC 6x6 HDR). Defaults to UASTC 4x4 HDR for backwards compatibility.
+		hdr_modes m_hdr_mode;
+				
 		bool_param<false> m_use_opencl;
 
 		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG etc. images to read. 
@@ -426,30 +510,31 @@ namespace basisu
 		bool_param<true> m_mip_fast;
 		param<int> m_mip_smallest_dimension;
 						
-		// Codebook size (quality) control. 
-		// If m_quality_level != -1, it controls the quality level. It ranges from [1,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX].
+		// ETC1S codebook size (quality) control. 
+		// If m_etc1s_quality_level != -1, it controls the quality level. It ranges from [1,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX].
 		// Otherwise m_etc1s_max_endpoint_clusters/m_etc1s_max_selector_clusters control the codebook sizes directly.
-		uint32_t m_max_endpoint_clusters;
-		uint32_t m_max_selector_clusters;
-		int m_quality_level;
+		uint32_t m_etc1s_max_endpoint_clusters;
+		uint32_t m_etc1s_max_selector_clusters;
+		int m_etc1s_quality_level;
 		
-		// m_tex_type, m_userdata0, m_userdata1, m_framerate - These fields go directly into the Basis file header.
+		// m_tex_type, m_userdata0, m_userdata1, m_us_per_frame - These fields go directly into the .basis file header.
 		basist::basis_texture_type m_tex_type;
 		uint32_t m_userdata0;
 		uint32_t m_userdata1;
 		uint32_t m_us_per_frame;
 
+		// UASTC LDR 4x4 parameters
 		// cPackUASTCLevelDefault, etc.
-		uint32_t m_pack_uastc_flags;
-		bool_param<false> m_rdo_uastc;
-		param<float> m_rdo_uastc_quality_scalar;
-		param<int> m_rdo_uastc_dict_size;
-		param<float> m_rdo_uastc_max_smooth_block_error_scale;
-		param<float> m_rdo_uastc_smooth_block_max_std_dev;
-		param<float> m_rdo_uastc_max_allowed_rms_increase_ratio;
-		param<float> m_rdo_uastc_skip_block_rms_thresh;
-		bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode;
-		bool_param<true> m_rdo_uastc_multithreading;
+		uint32_t m_pack_uastc_ldr_4x4_flags;
+		bool_param<false> m_rdo_uastc_ldr_4x4;
+		param<float> m_rdo_uastc_ldr_4x4_quality_scalar;
+		param<int> m_rdo_uastc_ldr_4x4_dict_size;
+		param<float> m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale;
+		param<float> m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev;
+		param<float> m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio;
+		param<float> m_rdo_uastc_ldr_4x4_skip_block_rms_thresh;
+		bool_param<true> m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode;
+		bool_param<true> m_rdo_uastc_ldr_4x4_multithreading;
 
 		param<int> m_resample_width;
 		param<int> m_resample_height;
@@ -465,13 +550,26 @@ namespace basisu
 		param<int> m_ktx2_zstd_supercompression_level;
 		bool_param<false> m_ktx2_srgb_transfer_func;
 
-		astc_hdr_codec_options m_uastc_hdr_options;
+		uastc_hdr_4x4_codec_options m_uastc_hdr_4x4_options;
+		astc_6x6_hdr::astc_hdr_6x6_global_config m_astc_hdr_6x6_options;
 
 		bool_param<false> m_validate_output_data;
 
-		// If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion) and then processed as HDR. 
-		// Otherwise, LDR images will be processed as HDR as-is.
-		bool_param<true> m_hdr_ldr_srgb_to_linear_conversion;
+		// LDR->HDR upconversion parameters.
+		// 
+		// If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion), or absolute luminance (nits or candelas per meter squared), and then processed as HDR. 
+		// Otherwise, LDR images are assumed to already be in linear light (i.e. they don't use the sRGB transfer function).
+		bool_param<true> m_ldr_hdr_upconversion_srgb_to_linear;
+		
+		// m_ldr_hdr_upconversion_nit_multiplier is only used when loading SDR/LDR images and compressing to an HDR output format.
+		// By default m_ldr_hdr_upconversion_nit_multiplier is 0. It's an override for the default.
+		// When loading LDR images, a default multiplier of 1.0 will be used in UASTC 4x4 HDR mode, partially for backwards compatibility with previous library releases, and also because it doesn't really matter with this encoder what the multiplier is.
+		// With the 6x6 HDR encoder it does matter because it expects inputs in absolute nits, so the LDR upconversion luminance multiplier default will be 100 nits. (Most SDR monitors were/are 80-100 nits or so.)
+		param<float> m_ldr_hdr_upconversion_nit_multiplier;
+
+		// The optional sRGB space bias to use during LDR->HDR upconversion. Should be between [0,.49] or so. Only applied on black (0.0) color components.
+		// Defaults to no bias (0.0f).
+		param<float> m_ldr_hdr_upconversion_black_bias;
 
 		// If true, ASTC HDR quality is favored more than BC6H quality. Otherwise it's a rough balance.
 		bool_param<false> m_hdr_favor_astc;
@@ -526,8 +624,10 @@ namespace basisu
 								
 	private:
 		basis_compressor_params m_params;
-
+				
 		opencl_context_ptr m_pOpenCL_context;
+
+		basist::basis_tex_format m_fmt_mode;
 		
 		basisu::vector<image> m_slice_images;
 		basisu::vector<imagef> m_slice_images_hdr;
@@ -543,6 +643,7 @@ namespace basisu
 		
 		basisu_frontend m_frontend;
 
+		// These are 4x4 blocks.
 		pixel_block_vec m_source_blocks;
 		pixel_block_hdr_vec m_source_blocks_hdr;
 
@@ -572,6 +673,15 @@ namespace basisu
 		basisu::vector<gpu_image> m_uastc_slice_textures;
 		basisu_backend_output m_uastc_backend_output;
 
+		// The amount the HDR input has to be scaled up in case it had to be rescaled to fit into half floats.
+		float m_hdr_image_scale; 
+		
+		// The upconversion multiplier used to load LDR images in HDR mode.
+		float m_ldr_to_hdr_upconversion_nit_multiplier;
+		
+		// True if any loaded source images were LDR and upconverted to HDR.
+		bool m_upconverted_any_ldr_images;
+
 		bool m_any_source_image_has_alpha;
 
 		bool m_opencl_failed;
@@ -588,14 +698,48 @@ namespace basisu
 		bool create_basis_file_and_transcode();
 		bool write_hdr_debug_images(const char* pBasename, const imagef& img, uint32_t width, uint32_t height);
 		bool write_output_files_and_compute_stats();
-		error_code encode_slices_to_uastc_hdr();
-		error_code encode_slices_to_uastc();
+		error_code encode_slices_to_astc_6x6_hdr();
+		error_code encode_slices_to_uastc_4x4_hdr();
+		error_code encode_slices_to_uastc_4x4_ldr();
 		bool generate_mipmaps(const imagef& img, basisu::vector<imagef>& mips, bool has_alpha);
 		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
 		bool validate_texture_type_constraints();
 		bool validate_ktx2_constraints();
-		void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr);
+		bool get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr);
 		bool create_ktx2_file();
+		void pick_format_mode();
+
+		uint32_t get_block_width() const
+		{
+			if (m_params.m_hdr)
+			{
+				switch (m_params.m_hdr_mode)
+				{
+				case hdr_modes::cASTC_HDR_6X6:
+				case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE:
+					return 6;
+				default:
+					break;
+				}
+			}
+			return 4;
+		}
+
+		uint32_t get_block_height() const
+		{
+			if (m_params.m_hdr)
+			{
+				switch (m_params.m_hdr_mode)
+				{
+				case hdr_modes::cASTC_HDR_6X6:
+				case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE:
+					return 6;
+				default:
+					break;
+				}
+			}
+			return 4;
+		}
 	};
 				
 	// Alternative simple C-style wrapper API around the basis_compressor class. 
@@ -628,21 +772,19 @@ namespace basisu
 		cFlagKTX2UASTCSuperCompression = 1 << 12, // use KTX2 Zstd supercompression on UASTC files
 
 		cFlagSRGB = 1 << 13,			// input texture is sRGB, use perceptual colorspace metrics, also use sRGB filtering during mipmap gen, and also sets KTX2 output transfer func to sRGB
-		cFlagGenMipsClamp = 1 << 14,  // generate mipmaps with clamp addressing
-		cFlagGenMipsWrap = 1 << 15,  // generate mipmaps with wrap addressing
-		
-		cFlagYFlip = 1 << 16,		// flip source image on Y axis before compression
+		cFlagGenMipsClamp = 1 << 14,	// generate mipmaps with clamp addressing
+		cFlagGenMipsWrap = 1 << 15,		// generate mipmaps with wrap addressing
 		
-		cFlagUASTC = 1 << 17,		// use UASTC compression vs. ETC1S
-		cFlagUASTCRDO = 1 << 18,		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		cFlagYFlip = 1 << 16,			// flip source image on Y axis before compression
 		
-		cFlagPrintStats = 1 << 19,	// print image stats to stdout
-		cFlagPrintStatus = 1 << 20,	// print status to stdout
+		cFlagUASTCRDO = 1 << 17,		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
 		
-		cFlagHDR = 1 << 21,			// Force encoder into HDR mode, even if source image is LDR.
-		cFlagHDRLDRImageSRGBToLinearConversion = 1 << 22, // In HDR mode, convert LDR source images to linear before encoding.
+		cFlagPrintStats = 1 << 18,		// print image stats to stdout
+		cFlagPrintStatus = 1 << 19,		// print status to stdout
 		
-		cFlagDebugImages = 1 << 23	// enable status output
+		cFlagDebugImages = 1 << 20,		// enable status output
+
+		cFlagREC2020 = 1 << 21			// ASTC 6x6 modes: treat input as REC 2020 vs. the default 709
 	};
 
 	// This function accepts an array of source images. 
@@ -652,6 +794,7 @@ namespace basisu
 	// basisu_encoder_init() MUST be called first!
 	// LDR version. NOTE(review): the cFlagHDR flag formerly referenced here is no longer in the flags enum; HDR output is presumably selected via the `mode` parameter - confirm.
 	void* basis_compress(
+		basist::basis_tex_format mode,
 		const basisu::vector<image> &source_images,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
 		size_t* pSize,
@@ -660,14 +803,16 @@ namespace basisu
 	// HDR-only version.
 	// Important: The returned block MUST be manually freed using basis_free_data().
 	void* basis_compress(
+		basist::basis_tex_format mode,
 		const basisu::vector<imagef>& source_images_hdr,
-		uint32_t flags_and_quality, 
+		uint32_t flags_and_quality, float lambda,
 		size_t* pSize,
 		image_stats* pStats = nullptr);
 
 	// This function only accepts a single LDR source image. It's just a wrapper for basis_compress() above.
 	// Important: The returned block MUST be manually freed using basis_free_data().
 	void* basis_compress(
+		basist::basis_tex_format mode,
 		const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
 		size_t* pSize,

+ 328 - 46
thirdparty/basis_universal/encoder/basisu_enc.cpp

@@ -21,7 +21,9 @@
 #include "jpgd.h"
 #include "pvpngreader.h"
 #include "basisu_opencl.h"
-#include "basisu_astc_hdr_enc.h"
+#include "basisu_uastc_hdr_4x4_enc.h"
+#include "basisu_astc_hdr_6x6_enc.h"
+
 #include <vector>
 
 #ifndef TINYEXR_USE_ZFP
@@ -47,10 +49,13 @@ namespace basisu
 {
 	uint64_t interval_timer::g_init_ticks, interval_timer::g_freq;
 	double interval_timer::g_timer_freq;
+
 #if BASISU_SUPPORT_SSE
 	bool g_cpu_supports_sse41;
 #endif
 
+	fast_linear_to_srgb g_fast_linear_to_srgb;
+
 	uint8_t g_hamming_dist[256] =
 	{
 		0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
@@ -175,7 +180,7 @@ namespace basisu
 
 	bool g_library_initialized;
 	std::mutex g_encoder_init_mutex;
-		
+				
 	// Encoder library initialization (just call once at startup)
 	bool basisu_encoder_init(bool use_opencl, bool opencl_force_serialization)
 	{
@@ -185,7 +190,7 @@ namespace basisu
 			return true;
 
 		detect_sse41();
-
+				
 		basist::basisu_transcoder_init();
 		pack_etc1_solid_color_init();
 		//uastc_init();
@@ -201,6 +206,7 @@ namespace basisu
 
 		astc_hdr_enc_init();
 		basist::bc6h_enc_init();
+		astc_6x6_hdr::global_init();
 
 		g_library_initialized = true;
 		return true;
@@ -215,15 +221,40 @@ namespace basisu
 
 	void error_vprintf(const char* pFmt, va_list args)
 	{
-		char buf[8192];
+		const uint32_t BUF_SIZE = 256;
+		char buf[BUF_SIZE];
 
-#ifdef _WIN32		
-		vsprintf_s(buf, sizeof(buf), pFmt, args);
-#else
-		vsnprintf(buf, sizeof(buf), pFmt, args);
-#endif
+		va_list args_copy;
+		va_copy(args_copy, args);
+		int total_chars = vsnprintf(buf, sizeof(buf), pFmt, args_copy);
+		va_end(args_copy);
 
-		fprintf(stderr, "ERROR: %s", buf);
+		if (total_chars < 0)
+		{
+			assert(0);
+			return;
+		}
+
+		if (total_chars >= (int)BUF_SIZE)
+		{
+			basisu::vector<char> var_buf(total_chars + 1);
+			
+			va_copy(args_copy, args);
+			int total_chars_retry = vsnprintf(var_buf.data(), var_buf.size(), pFmt, args_copy);
+			va_end(args_copy);
+
+			if (total_chars_retry < 0)
+			{
+				assert(0);
+				return;
+			}
+
+			fprintf(stderr, "ERROR: %s", var_buf.data());
+		}
+		else
+		{
+			fprintf(stderr, "ERROR: %s", buf);
+		}
 	}
 
 	void error_printf(const char *pFmt, ...)
@@ -234,6 +265,18 @@ namespace basisu
 		va_end(args);
 	}
 
+#if defined(_WIN32)
+	void platform_sleep(uint32_t ms)
+	{
+		Sleep(ms);
+	}
+#else
+	void platform_sleep(uint32_t ms)
+	{
+		// TODO
+	}
+#endif
+
 #if defined(_WIN32)
 	inline void query_counter(timer_ticks* pTicks)
 	{
@@ -331,6 +374,8 @@ namespace basisu
 		return ticks * g_timer_freq;
 	}
 
+	// Note this is linear<->sRGB, NOT REC709 which uses slightly different equations/transfer functions. 
+	// However the gamuts/white points of REC709 and sRGB are the same.
 	float linear_to_srgb(float l)
 	{
 		assert(l >= 0.0f && l <= 1.0f);
@@ -339,7 +384,7 @@ namespace basisu
 		else
 			return saturate(1.055f * powf(l, 1.0f / 2.4f) - .055f);
 	}
-
+		
 	float srgb_to_linear(float s)
 	{
 		assert(s >= 0.0f && s <= 1.0f);
@@ -418,7 +463,8 @@ namespace basisu
 
 		uint32_t width = 0, height = 0, num_chans = 0;
 		void* pImage = pv_png::load_png(pBuf, buf_size, 4, width, height, num_chans);
-		if (!pBuf)
+
+		if (!pImage)
 		{
 			error_printf("pv_png::load_png failed while loading image \"%s\"\n", pFilename);
 			return false;
@@ -457,6 +503,26 @@ namespace basisu
 		return true;
 	}
 
+	bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img)
+	{
+		if (buf_size > INT_MAX)
+		{
+			assert(0);
+			return false;
+		}
+
+		int width = 0, height = 0, actual_comps = 0;
+		uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagBoxChromaFiltering);
+		if (!pImage_data)
+			return false;
+
+		img.init(pImage_data, width, height, 4);
+
+		free(pImage_data);
+
+		return true;
+	}
+
 	bool load_image(const char* pFilename, image& img)
 	{
 		std::string ext(string_get_extension(std::string(pFilename)));
@@ -478,7 +544,7 @@ namespace basisu
 		return false;
 	}
 
-	static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear)
+	static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f)
 	{
 		img.resize(ldr_img.get_width(), ldr_img.get_height());
 
@@ -491,23 +557,41 @@ namespace basisu
 				vec4F& d = img(x, y);
 				if (ldr_srgb_to_linear)
 				{
-					// TODO: Multiply by 100-200 nits?
-					d[0] = srgb_to_linear(c[0] * (1.0f / 255.0f));
-					d[1] = srgb_to_linear(c[1] * (1.0f / 255.0f));
-					d[2] = srgb_to_linear(c[2] * (1.0f / 255.0f));
+					float r = (float)c[0];
+					float g = (float)c[1];
+					float b = (float)c[2];
+
+					if (ldr_black_bias > 0.0f)
+					{
+						// ASTC HDR is noticeably weaker dealing with blocks containing some pixels with components set to 0.
+						// Add a very slight bias less than .5 to avoid this difficulty. When the HDR image is mapped to SDR sRGB and rounded back to 8-bits, this bias will still result in zero.
+						// (FWIW, in reality, a physical monitor would be unlikely to have a perfectly zero black level.)
+						// This is purely optional and on most images it doesn't matter visually.
+						if (r == 0.0f)
+							r = ldr_black_bias;
+						if (g == 0.0f)
+							g = ldr_black_bias;
+						if (b == 0.0f)
+							b = ldr_black_bias;
+					}
+
+					// Compute how much linear light would be emitted by a SDR 80-100 nit monitor.
+					d[0] = srgb_to_linear(r * (1.0f / 255.0f)) * linear_nit_multiplier;
+					d[1] = srgb_to_linear(g * (1.0f / 255.0f)) * linear_nit_multiplier;
+					d[2] = srgb_to_linear(b * (1.0f / 255.0f)) * linear_nit_multiplier;
 				}
 				else
 				{
-					d[0] = c[0] * (1.0f / 255.0f);
-					d[1] = c[1] * (1.0f / 255.0f);
-					d[2] = c[2] * (1.0f / 255.0f);
+					d[0] = c[0] * (1.0f / 255.0f) * linear_nit_multiplier;
+					d[1] = c[1] * (1.0f / 255.0f) * linear_nit_multiplier;
+					d[2] = c[2] * (1.0f / 255.0f) * linear_nit_multiplier;
 				}
 				d[3] = c[3] * (1.0f / 255.0f);
 			}
 		}
 	}
 
-	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear)
+	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier, float ldr_black_bias)
 	{
 		if ((!pMem) || (!mem_size))
 		{
@@ -571,13 +655,22 @@ namespace basisu
 
 			break;
 		}
+		case hdr_image_type::cHITJPGImage:
+		{
+			image ldr_img;
+			if (!load_jpg(static_cast<const uint8_t*>(pMem), mem_size, ldr_img))
+				return false;
+
+			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);
+			break;
+		}
 		case hdr_image_type::cHITPNGImage:
 		{
 			image ldr_img;
 			if (!load_png(static_cast<const uint8_t *>(pMem), mem_size, ldr_img))
 				return false;
 
-			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear);
+			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);
 			break;
 		}
 		case hdr_image_type::cHITEXRImage:
@@ -605,8 +698,21 @@ namespace basisu
 
 		return true;
 	}
+
+	bool is_image_filename_hdr(const char *pFilename)
+	{
+		std::string ext(string_get_extension(std::string(pFilename)));
+
+		if (ext.length() == 0)
+			return false;
+
+		const char* pExt = ext.c_str();
+
+		return ((strcasecmp(pExt, "hdr") == 0) || (strcasecmp(pExt, "exr") == 0));
+	}
 	
-	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear)
+	// TODO: move parameters to struct, add a HDR clean flag to eliminate NaN's/Inf's
+	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear, float linear_nit_multiplier, float ldr_black_bias)
 	{
 		std::string ext(string_get_extension(std::string(pFilename)));
 
@@ -637,7 +743,7 @@ namespace basisu
 			if (!load_image(pFilename, ldr_img))
 				return false;
 
-			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear);
+			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);
 		}
 
 		return true;
@@ -1002,7 +1108,7 @@ namespace basisu
 			return false;
 		}
 
-		if ((src_w == dst_w) && (src_h == dst_h))
+		if ((src_w == dst_w) && (src_h == dst_h) && (filter_scale == 1.0f))
 		{
 			dst = src;
 			return true;
@@ -1652,7 +1758,7 @@ namespace basisu
 
 		uint32_t a = max_index / num_syms, b = max_index % num_syms;
 
-		const uint32_t ofs = m_entries_picked.size();
+		const size_t ofs = m_entries_picked.size();
 
 		m_entries_picked.push_back(a);
 		m_entries_picked.push_back(b);
@@ -2002,6 +2108,34 @@ namespace basisu
 		m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f;
 	}
 
+	void print_image_metrics(const image& a, const image& b) // Computes image_metrics between a and b for several channel selections and prints each one under a label.
+	{
+		image_metrics im;
+		im.calc(a, b, 0, 3); // calc() arguments after the images: first channel, then total channels.
+		im.print("RGB    ");
+
+		im.calc(a, b, 0, 4);
+		im.print("RGBA   ");
+
+		im.calc(a, b, 0, 1); // Single-channel passes follow: channels 0..3, one at a time.
+		im.print("R      ");
+
+		im.calc(a, b, 1, 1);
+		im.print("G      ");
+
+		im.calc(a, b, 2, 1);
+		im.print("B      ");
+
+		im.calc(a, b, 3, 1);
+		im.print("A      ");
+
+		im.calc(a, b, 0, 0); // total_chans == 0 with use_601_luma left at its default (false); printed as the 709 luma result.
+		im.print("Y 709  ");
+
+		im.calc(a, b, 0, 0, true, true); // Same call with use_601_luma == true; printed as the 601 luma result.
+		im.print("Y 601  ");
+	}
+
 	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed)
 	{
 		rand r(seed);
@@ -2079,9 +2213,11 @@ namespace basisu
 	}
 
 	job_pool::job_pool(uint32_t num_threads) : 
-		m_num_active_jobs(0),
-		m_kill_flag(false)
+		m_num_active_jobs(0)
 	{
+		m_kill_flag.store(false);
+		m_num_active_workers.store(0);
+
 		assert(num_threads >= 1U);
 
 		debug_printf("job_pool::job_pool: %u total threads\n", num_threads);
@@ -2100,11 +2236,23 @@ namespace basisu
 		debug_printf("job_pool::~job_pool\n");
 		
 		// Notify all workers that they need to die right now.
-		m_kill_flag = true;
+		m_kill_flag.store(true);
 		
 		m_has_work.notify_all();
 
-		// Wait for all workers to die.
+#ifdef __EMSCRIPTEN__
+		for ( ; ; )
+		{
+			if (m_num_active_workers.load() <= 0)
+				break;
+			std::this_thread::sleep_for(std::chrono::milliseconds(50));
+		}
+		
+		// At this point all worker threads should be exiting or exited.
+		// We could call detach(), but this seems to just call join() anyway.
+#endif
+
+		// Wait for all worker threads to exit.
 		for (uint32_t i = 0; i < m_threads.size(); i++)
 			m_threads[i].join();
 	}
@@ -2157,13 +2305,26 @@ namespace basisu
 		}
 
 		// The queue is empty, now wait for all active jobs to finish up.
+#ifndef __EMSCRIPTEN__
 		m_no_more_jobs.wait(lock, [this]{ return !m_num_active_jobs; } );
+#else
+		// Avoid infinite blocking
+		for (; ; )
+		{
+			if (m_no_more_jobs.wait_for(lock, std::chrono::milliseconds(50), [this] { return !m_num_active_jobs; }))
+			{
+				break;
+			}
+		}
+#endif
 	}
 
 	void job_pool::job_thread(uint32_t index)
 	{
 		BASISU_NOTE_UNUSED(index);
 		//debug_printf("job_pool::job_thread: starting %u\n", index);
+
+		m_num_active_workers.fetch_add(1);
 		
 		while (true)
 		{
@@ -2199,6 +2360,8 @@ namespace basisu
 				m_no_more_jobs.notify_all();
 		}
 
+		m_num_active_workers.fetch_add(-1);
+
 		//debug_printf("job_pool::job_thread: exiting\n");
 	}
 
@@ -3314,7 +3477,7 @@ namespace basisu
 		return true;
 	}
 
-	bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags)
+	bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags)
 	{
 		assert((n_chans == 1) || (n_chans == 3) || (n_chans == 4));
 
@@ -3483,11 +3646,14 @@ namespace basisu
 	
 	// Very basic global Reinhard tone mapping, output converted to sRGB with no dithering, alpha is carried through unchanged. 
 	// Only used for debugging/development.
-	void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure)
+	void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure, bool add_noise, bool per_component, bool luma_scaling)
 	{
 		uint32_t width = hdr_img.get_width(), height = hdr_img.get_height();
 
 		ldr_img.resize(width, height);
+
+		rand r;
+		r.seed(128);
 				
 		for (uint32_t y = 0; y < height; y++)
 		{
@@ -3495,32 +3661,84 @@ namespace basisu
 			{
 				vec4F c(hdr_img(x, y));
 
-				for (uint32_t t = 0; t < 3; t++)
+				if (per_component)
 				{
-					if (c[t] <= 0.0f)
+					for (uint32_t t = 0; t < 3; t++)
 					{
-						c[t] = 0.0f;
+						if (c[t] <= 0.0f)
+						{
+							c[t] = 0.0f;
+						}
+						else
+						{
+							c[t] *= exposure;
+							c[t] = c[t] / (1.0f + c[t]);
+						}
 					}
-					else
+				}
+				else
+				{
+					c[0] *= exposure;
+					c[1] *= exposure;
+					c[2] *= exposure;
+
+					const float L = 0.2126f * c[0] + 0.7152f * c[1] + 0.0722f * c[2];
+
+					float Lmapped = 0.0f;
+					if (L > 0.0f)
 					{
-						c[t] *= exposure;
-						c[t] = c[t] / (1.0f + c[t]);
+						//Lmapped = L / (1.0f + L);
+						//Lmapped /= L;
+						
+						Lmapped = 1.0f / (1.0f + L);
+					}
+
+					c[0] = c[0] * Lmapped;
+					c[1] = c[1] * Lmapped;
+					c[2] = c[2] * Lmapped;
+
+					if (luma_scaling)
+					{
+						// Keeps the ratio of r/g/b intact
+						float m = maximum(c[0], c[1], c[2]);
+						if (m > 1.0f)
+						{
+							c /= m;
+						}
 					}
 				}
 
 				c.clamp(0.0f, 1.0f);
 
-				c[0] = linear_to_srgb(c[0]) * 255.0f;
-				c[1] = linear_to_srgb(c[1]) * 255.0f;
-				c[2] = linear_to_srgb(c[2]) * 255.0f;
 				c[3] = c[3] * 255.0f;
 
 				color_rgba& o = ldr_img(x, y);
-				
-				o[0] = (uint8_t)std::round(c[0]);
-				o[1] = (uint8_t)std::round(c[1]);
-				o[2] = (uint8_t)std::round(c[2]);
-				o[3] = (uint8_t)std::round(c[3]);
+
+				if (add_noise)
+				{
+					c[0] = linear_to_srgb(c[0]) * 255.0f;
+					c[1] = linear_to_srgb(c[1]) * 255.0f;
+					c[2] = linear_to_srgb(c[2]) * 255.0f;
+
+					const float NOISE_AMP = .5f;
+					c[0] += r.frand(-NOISE_AMP, NOISE_AMP);
+					c[1] += r.frand(-NOISE_AMP, NOISE_AMP);
+					c[2] += r.frand(-NOISE_AMP, NOISE_AMP);
+
+					c.clamp(0.0f, 255.0f);
+
+					o[0] = (uint8_t)fast_roundf_int(c[0]);
+					o[1] = (uint8_t)fast_roundf_int(c[1]);
+					o[2] = (uint8_t)fast_roundf_int(c[2]);
+					o[3] = (uint8_t)fast_roundf_int(c[3]);
+				}
+				else
+				{
+					o[0] = g_fast_linear_to_srgb.convert(c[0]);
+					o[1] = g_fast_linear_to_srgb.convert(c[1]);
+					o[2] = g_fast_linear_to_srgb.convert(c[2]);
+					o[3] = (uint8_t)fast_roundf_int(c[3]);
+				}
 			}
 		}
 	}
@@ -3681,5 +3899,69 @@ namespace basisu
 
 		return true;
 	}
+
+	bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img) // Maps the RGB of hdr_test_img to 8 bits by linearly rescaling half-float encodings between the image's lowest and highest encoding. Returns false if they are equal.
+	{
+		const uint32_t width = hdr_test_img.get_width();
+		const uint32_t height = hdr_test_img.get_height();
+
+		dst_img.resize(width, height);
+		dst_img.set_all(color_rgba(0, 0, 0, 255)); // Alpha stays 255: the loops below only write components 0..2.
+
+		basisu::vector<basist::half_float> half_img(width * 3 * height); // NOTE(review): element count is computed in 32 bits -- confirm very large images cannot wrap it.
+		// Lowest/highest half-float encodings seen across all RGB samples.
+		uint32_t low_h = UINT32_MAX, high_h = 0;
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F& p = hdr_test_img(x, y);
+
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					float f = p[i];
+
+					if (std::isnan(f) || std::isinf(f)) // NaN and +/-Inf become 0 (Inf is not clamped to the maximum).
+						f = 0.0f;
+					else if (f < 0.0f) // Negative values become 0.
+						f = 0.0f;
+					else if (f > basist::MAX_HALF_FLOAT) // Too large for a half float: clamp.
+						f = basist::MAX_HALF_FLOAT;
+
+					uint32_t h = basist::float_to_half(f);
+
+					low_h = minimum(low_h, h);
+					high_h = maximum(high_h, h);
+					// Interleaved RGB storage, 3 halves per pixel.
+					half_img[(x + y * width) * 3 + i] = (basist::half_float)h;
+
+				} // i
+			} // x
+		} // y
+
+		if (low_h == high_h) // No range to normalize over (would divide by zero below).
+			return false;
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					basist::half_float h = half_img[(x + y * width) * 3 + i];
 					
+					float f = (float)(h - low_h) / (float)(high_h - low_h); // Normalizes the integer encoding itself, not the decoded float value.
+
+					int iv = basisu::clamp<int>((int)std::round(f * 255.0f), 0, 255);
+
+					dst_img(x, y)[i] = (uint8_t)iv;
+
+				} // i
+			} // x
+		} // y
+
+		return true;
+	}
+							
 } // namespace basisu

+ 432 - 151
thirdparty/basis_universal/encoder/basisu_enc.h

@@ -64,8 +64,19 @@ namespace basisu
 	void error_vprintf(const char* pFmt, va_list args);
 	void error_printf(const char *pFmt, ...);
 	
-	// Helpers
+	template <typename... Args>
+	inline void fmt_error_printf(const char* pFmt, Args&&... args) // Builds a string with fmt_variants() and passes it to error_printf().
+	{
+		std::string res;
+		if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward<Args>(args))... })) // Each argument is wrapped in a fmt_variant.
+			return; // fmt_variants() reported failure: nothing is printed.
+		error_printf("%s", res.c_str()); // Passed as a "%s" argument so the formatted text is not itself treated as a format string.
+	}
 
+	void platform_sleep(uint32_t ms);
+	
+	// Helpers
+		
 	inline uint8_t clamp255(int32_t i)
 	{
 		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
@@ -98,6 +109,17 @@ namespace basisu
 		return (uint8_t)((v + (v >> 8)) >> 8);
 	}
 
+	inline int fast_roundf_int(float x) // Rounds to the nearest int using a truncating cast; exact halves go away from zero.
+	{
+		return (x >= 0.0f) ? (int)(x + 0.5f) : (int)(x - 0.5f); // Bias by +/-0.5 toward the sign of x, then truncate.
+	}
+
+	inline int fast_floorf_int(float x) // Floor of x as an int, computed from a truncating cast.
+	{
+		int xi = (int)x;  // Truncate towards zero
+		return ((x < 0.0f) && (x != (float)xi)) ? (xi - 1) : xi; // Negative non-integers truncate upward, so step down by one.
+	}
+
 	inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
 	{
 		assert(codesize <= 64);
@@ -168,6 +190,15 @@ namespace basisu
 	}
 
 	bool string_begins_with(const std::string& str, const char* pPhrase);
+
+	// Case-sensitive search for pPhrase in str, starting at index 0. Returns the index of the first match as an int, or -1 if there is none.
+	inline int string_find_first(const std::string& str, const char* pPhrase)
+	{
+		size_t res = str.find(pPhrase, 0);
+		if (res == std::string::npos)
+			return -1;
+		return (int)res; // Narrowed from size_t to int.
+	}
 				
 	// Hashing
 	
@@ -209,12 +240,23 @@ namespace basisu
 	template <typename Key>
 	struct bit_hasher
 	{
-		std::size_t operator()(const Key& k) const
+		inline std::size_t operator()(const Key& k) const
 		{
 			return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));
 		}
 	};
 
+	struct string_hasher // Hash functor for std::string keys, built on hash_hsieh().
+	{
+		inline std::size_t operator()(const std::string& k) const
+		{
+			size_t l = k.size();
+			if (!l) // Empty strings hash to 0 without calling hash_hsieh().
+				return 0;
+			return hash_hsieh(reinterpret_cast<const uint8_t*>(k.c_str()), l); // Hashes size() bytes, so the terminating NUL is not included.
+		}
+	};
+
 	class running_stat
 	{
 	public:
@@ -297,7 +339,7 @@ namespace basisu
 	};
 
 	// Linear algebra
-
+			
 	template <uint32_t N, typename T>
 	class vec
 	{
@@ -318,7 +360,7 @@ namespace basisu
 		inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }
 		template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }
 
-		inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
+		inline const T& operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
 		inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }
 
 		inline T getX() const { return m_v[0]; }
@@ -327,6 +369,7 @@ namespace basisu
 		inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }
 
 		inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false;	return true; }
+		inline bool operator!=(const vec& rhs) const { return !(*this == rhs); } // Defined as the negation of operator==.
 		inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
 
 		inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
@@ -433,6 +476,8 @@ namespace basisu
 
 		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; }
 
+		inline vec get_normalized() const { vec res(*this); res.normalize_in_place(); return res; } // Returns a copy run through normalize_in_place(); *this is not modified.
+
 		inline vec &clamp(T l, T h)
 		{
 			for (uint32_t i = 0; i < N; i++)
@@ -440,6 +485,14 @@ namespace basisu
 			return *this;
 		}
 
+		static vec component_mul(const vec& a, const vec& b) // Per-component product: res[i] = a[i] * b[i].
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = a[i] * b[i];
+			return res;
+		}
+
 		static vec component_min(const vec& a, const vec& b)
 		{
 			vec res;
@@ -455,6 +508,14 @@ namespace basisu
 				res[i] = maximum(a[i], b[i]);
 			return res;
 		}
+
+		static vec lerp(const vec& a, const vec& b, float s) // Applies basisu::lerp() to each component pair with the same factor s.
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = basisu::lerp(a[i], b[i], s); // Qualified so the free function is called, not this member.
+			return res;
+		}
 	};
 
 	typedef vec<4, double> vec4D;
@@ -462,12 +523,17 @@ namespace basisu
 	typedef vec<2, double> vec2D;
 	typedef vec<1, double> vec1D;
 
+	typedef vec<6, float> vec6F;
+	typedef vec<5, float> vec5F;
 	typedef vec<4, float> vec4F;
 	typedef vec<3, float> vec3F;
 	typedef vec<2, float> vec2F;
 	typedef vec<1, float> vec1F;
 
 	typedef vec<16, float> vec16F;
+
+	template<uint32_t N, typename T> struct bitwise_copyable< vec<N, T> > { enum { cFlag = true }; }; // Sets the bitwise_copyable flag for every vec<N, T>. NOTE(review): the trait's consumers are not visible here.
+	template<uint32_t N, typename T> struct bitwise_movable< vec<N, T> > { enum { cFlag = true }; }; // Sets the bitwise_movable flag for every vec<N, T>.
 		
 	template <uint32_t Rows, uint32_t Cols, typename T>
 	class matrix
@@ -514,6 +580,9 @@ namespace basisu
 		}
 	};
 
+	template<uint32_t R, uint32_t C, typename T> struct bitwise_copyable< matrix<R, C, T> > { enum { cFlag = true }; }; // Sets the bitwise_copyable flag for every matrix<R, C, T>. NOTE(review): the trait's consumers are not visible here.
+	template<uint32_t R, uint32_t C, typename T> struct bitwise_movable< matrix<R, C, T> > { enum { cFlag = true }; }; // Sets the bitwise_movable flag for every matrix<R, C, T>.
+
 	template<uint32_t N, typename VectorType>
 	inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)
 	{
@@ -759,6 +828,8 @@ namespace basisu
 		
 		std::atomic<bool> m_kill_flag;
 
+		std::atomic<int> m_num_active_workers;
+
 		void job_thread(uint32_t index);
 	};
 
@@ -962,6 +1033,9 @@ namespace basisu
 		inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } 
 		inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }
 
+		inline uint32_t get_bgra_uint32() const { return b | (g << 8) | (r << 16) | (a << 24); } // Packs b into bits 0-7, g into 8-15, r into 16-23, a into 24-31. NOTE(review): if a is an 8-bit type it is promoted to int before the shift by 24 -- consider an explicit uint32_t cast.
+		inline uint32_t get_rgba_uint32() const { return r | (g << 8) | (b << 16) | (a << 24); } // Packs r into bits 0-7, g into 8-15, b into 16-23, a into 24-31. NOTE(review): if a is an 8-bit type it is promoted to int before the shift by 24 -- consider an explicit uint32_t cast.
+
 		inline basist::color32 get_color32() const
 		{
 			return basist::color32(r, g, b, a);
@@ -1135,23 +1209,7 @@ namespace basisu
 
 		return true;
 	}
-
-	inline std::string string_format(const char* pFmt, ...)
-	{
-		char buf[2048];
-
-		va_list args;
-		va_start(args, pFmt);
-#ifdef _WIN32		
-		vsprintf_s(buf, sizeof(buf), pFmt, args);
-#else
-		vsnprintf(buf, sizeof(buf), pFmt, args);
-#endif		
-		va_end(args);
-
-		return std::string(buf);
-	}
-
+		
 	inline std::string string_tolower(const std::string& s)
 	{
 		std::string result(s);
@@ -1710,7 +1768,7 @@ namespace basisu
 				// This SSE function takes pointers to void types, so do some sanity checks.
 				assert(sizeof(TrainingVectorType) == sizeof(float) * 16);
 				assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));
-				update_covar_matrix_16x16_sse41(node.m_training_vecs.size(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
+				update_covar_matrix_16x16_sse41(node.m_training_vecs.size_u32(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
 #endif
 			}
 
@@ -2019,9 +2077,7 @@ namespace basisu
 
 		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
 		{
-#ifndef __EMSCRIPTEN__
 			pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {
-#endif
 
 				Quantizer& lq = quantizers[thread_iter];
 				uint_vec& cluster_indices = initial_codebook[thread_iter];
@@ -2062,15 +2118,11 @@ namespace basisu
 					}
 				}
 
-#ifndef __EMSCRIPTEN__
 			} );
-#endif
 
 		} // thread_iter
 
-#ifndef __EMSCRIPTEN__
 		pJob_pool->wait_for_all();
-#endif
 
 		uint32_t total_clusters = 0, total_parent_clusters = 0;
 
@@ -2353,6 +2405,48 @@ namespace basisu
 		{
 		}
 
+		bitwise_coder(const bitwise_coder& other) : // Copy constructor: duplicates the byte vector and all bit-buffer state.
+			m_bytes(other.m_bytes),
+			m_bit_buffer(other.m_bit_buffer),
+			m_bit_buffer_size(other.m_bit_buffer_size),
+			m_total_bits(other.m_total_bits)			
+		{
+		}
+
+		bitwise_coder(bitwise_coder&& other) : // Move constructor: moves the byte vector and copies the scalar state.
+			m_bytes(std::move(other.m_bytes)),
+			m_bit_buffer(other.m_bit_buffer), // The scalar members of other are copied, not reset.
+			m_bit_buffer_size(other.m_bit_buffer_size),
+			m_total_bits(other.m_total_bits)
+		{
+		}
+
+		bitwise_coder& operator= (const bitwise_coder& rhs) // Copy assignment: duplicates rhs's bytes and bit-buffer state.
+		{
+			if (this == &rhs) // Self-assignment is a no-op.
+				return *this;
+
+			m_bytes = rhs.m_bytes;
+			m_bit_buffer = rhs.m_bit_buffer;
+			m_bit_buffer_size = rhs.m_bit_buffer_size;
+			m_total_bits = rhs.m_total_bits;
+
+			return *this;
+		}
+
+		bitwise_coder& operator= (bitwise_coder&& rhs) // Move assignment: moves rhs's byte vector and copies its scalar state.
+		{
+			if (this == &rhs) // Self-assignment is a no-op.
+				return *this;
+
+			m_bytes = std::move(rhs.m_bytes);
+			m_bit_buffer = rhs.m_bit_buffer; // The scalar members of rhs are copied, not reset.
+			m_bit_buffer_size = rhs.m_bit_buffer_size;
+			m_total_bits = rhs.m_total_bits;
+
+			return *this;
+		}
+
 		inline void clear()
 		{
 			clear_vector(m_bytes);
@@ -2370,8 +2464,12 @@ namespace basisu
 		}
 
 		inline const uint8_vec &get_bytes() const { return m_bytes; }
+		inline uint8_vec& get_bytes() { return m_bytes; } // Non-const overload: gives callers mutable access to the byte vector.
+
+		inline void reserve(uint32_t size) { m_bytes.reserve(size); } // Forwards to m_bytes.reserve(); size is a count of bytes, not bits.
 
 		inline uint64_t get_total_bits() const { return m_total_bits; }
+		inline uint32_t get_total_bits_u32() const { assert(m_total_bits <= UINT32_MAX); return static_cast<uint32_t>(m_total_bits); } // 32-bit view of the 64-bit total; the range is checked only by the assert.
 		inline void clear_total_bits() { m_total_bits = 0; }
 
 		inline void init(uint32_t reserve_size = 1024)
@@ -2495,16 +2593,27 @@ namespace basisu
 		}
 
 		uint32_t emit_huffman_table(const huffman_encoding_table &tab);
+
+		void append(const bitwise_coder& other) // Feeds other's stored bytes, then its pending partial bits, through this coder's put_bits().
+		{
+			for (uint32_t i = 0; i < other.m_bytes.size(); i++)
+				put_bits(other.m_bytes[i], 8); // Each stored byte is emitted as an 8-bit code.
+		
+			if (other.m_bit_buffer_size)
+				put_bits(other.m_bit_buffer, other.m_bit_buffer_size); // Bits still held in other's bit buffer.
+		}
 		
 	private:
 		uint8_vec m_bytes;
 		uint32_t m_bit_buffer, m_bit_buffer_size;
 		uint64_t m_total_bits;
 
-		void append_byte(uint8_t c)
+		inline void append_byte(uint8_t c)
 		{
-			m_bytes.resize(m_bytes.size() + 1);
-			m_bytes.back() = c;
+			//m_bytes.resize(m_bytes.size() + 1);
+			//m_bytes.back() = c;
+
+			m_bytes.push_back(c);
 		}
 
 		static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);
@@ -2672,6 +2781,31 @@ namespace basisu
 			*this = other;
 		}
 
+		image(image&& other) : // Move constructor: takes other's dimensions and moves its pixel vector.
+			m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
+			m_pixels(std::move(other.m_pixels))
+		{
+			other.m_width = 0; // The source's width, height and pitch are zeroed.
+			other.m_height = 0;
+			other.m_pitch = 0;
+		}
+
+		image& operator= (image&& rhs) // Move assignment: takes rhs's dimensions and pixel vector, then zeroes rhs's dimensions.
+		{
+			if (this != &rhs) // Self-assignment leaves the object untouched.
+			{
+				m_width = rhs.m_width;
+				m_height = rhs.m_height;
+				m_pitch = rhs.m_pitch;
+				m_pixels = std::move(rhs.m_pixels);
+
+				rhs.m_width = 0;
+				rhs.m_height = 0;
+				rhs.m_pitch = 0;
+			}
+			return *this;
+		}
+
 		image &swap(image &other)
 		{
 			std::swap(m_width, other.m_width);
@@ -2702,6 +2836,12 @@ namespace basisu
 			return *this;
 		}
 
+		image& match_dimensions(const image& other) // Resizes this image to other's width and height; returns *this.
+		{
+			resize(other.get_width(), other.get_height()); // Pitch and background use resize()'s defaults.
+			return *this;
+		}
+
 		image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)
 		{
 			return crop(w, h, p, background);
@@ -2913,7 +3053,7 @@ namespace basisu
 					const int sx = src_x + x;
 					if (sx < 0)
 						continue;
-					else if (sx >= (int)src.get_height())
+					else if (sx >= (int)src.get_width())
 						break;
 
 					set_clipped(dst_x + x, dst_y + y, src(sx, sy));
@@ -2955,6 +3095,8 @@ namespace basisu
 			return *this;
 		}
 
+		inline bool is_valid() const { return m_width > 0; } // Only the width is checked.
+
 		inline uint32_t get_width() const { return m_width; }
 		inline uint32_t get_height() const { return m_height; }
 		inline uint32_t get_pitch() const { return m_pitch; }
@@ -3038,8 +3180,56 @@ namespace basisu
 			return *this;
 		}
 
+		void swap_rb() // Swaps the r and b components of every entry in m_pixels, in place.
+		{
+			for (auto& v : m_pixels)
+				std::swap(v.r, v.b);
+		}
+
 		void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);
 				
+		vec4F get_filtered_vec4F(float x, float y) const // Bilinear sample at (x, y); components are returned as floats in the source components' numeric range (not normalized).
+		{
+			x -= .5f; // Shift by half a pixel so a sample taken at (.5, .5) lands exactly on texel (0, 0).
+			y -= .5f;
+
+			int ix = (int)floorf(x);
+			int iy = (int)floorf(y);
+			float wx = x - ix; // Fractional position within the 2x2 neighborhood.
+			float wy = y - iy;
+
+			color_rgba a(get_clamped(ix, iy)); // The four neighbors are fetched through get_clamped().
+			color_rgba b(get_clamped(ix + 1, iy));
+			color_rgba c(get_clamped(ix, iy + 1));
+			color_rgba d(get_clamped(ix + 1, iy + 1));
+
+			vec4F result;
+
+			for (uint32_t i = 0; i < 4; i++)
+			{
+				const float top = lerp<float>((float)a[i], (float)b[i], wx); // Horizontal blend of the upper pair.
+				const float bot = lerp<float>((float)c[i], (float)d[i], wx); // Horizontal blend of the lower pair.
+				const float m = lerp<float>((float)top, (float)bot, wy); // Vertical blend of the two rows.
+
+				result[i] = m;
+			}
+
+			return result;
+		}
+
+		// Bilinearly filtered fetch. (x,y) are continuous coordinates where pixel centers are at (.5,.5); valid image coords are [0,width] and [0,height]. Clamp addressing.
+		color_rgba get_filtered(float x, float y) const
+		{
+			const vec4F fresult(get_filtered_vec4F(x, y));
+
+			color_rgba result;
+
+			for (uint32_t i = 0; i < 4; i++)
+				result[i] = (uint8_t)clamp<int>((int)(fresult[i] + .5f), 0, 255); // Add .5 and truncate to round, then clamp to the 8-bit range.
+
+			return result;
+		}
+				
 	private:
 		uint32_t m_width, m_height, m_pitch;  // all in pixels
 		color_rgba_vec m_pixels;
@@ -3069,6 +3259,31 @@ namespace basisu
 			*this = other;
 		}
 
+		imagef(imagef&& other) : // Move constructor: takes other's dimensions and moves its pixel vector.
+			m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
+			m_pixels(std::move(other.m_pixels))
+		{
+			other.m_width = 0; // The source's width, height and pitch are zeroed.
+			other.m_height = 0;
+			other.m_pitch = 0;
+		}
+
+		imagef& operator= (imagef&& rhs) // Move assignment: takes rhs's dimensions and pixel vector, then zeroes rhs's dimensions.
+		{
+			if (this != &rhs) // Self-assignment leaves the object untouched.
+			{
+				m_width = rhs.m_width;
+				m_height = rhs.m_height;
+				m_pitch = rhs.m_pitch;
+				m_pixels = std::move(rhs.m_pixels);
+
+				rhs.m_width = 0;
+				rhs.m_height = 0;
+				rhs.m_pitch = 0;
+			}
+			return *this;
+		}
+
 		imagef &swap(imagef &other)
 		{
 			std::swap(m_width, other.m_width);
@@ -3118,6 +3333,12 @@ namespace basisu
 			return *this;
 		}
 
+		imagef& match_dimensions(const imagef& other) // Resizes this image to other's width and height; returns *this.
+		{
+			resize(other.get_width(), other.get_height());
+			return *this;
+		}
+
 		imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
 		{
 			return resize(other.get_width(), other.get_height(), p, background);
@@ -3248,7 +3469,7 @@ namespace basisu
 					const int sx = src_x + x;
 					if (sx < 0)
 						continue;
-					else if (sx >= (int)src.get_height())
+					else if (sx >= (int)src.get_width())
 						break;
 
 					set_clipped(dst_x + x, dst_y + y, src(sx, sy));
@@ -3274,10 +3495,12 @@ namespace basisu
 			return *this;
 		}
 
+		inline bool is_valid() const { return m_width > 0; } // Only the width is checked.
+
 		inline uint32_t get_width() const { return m_width; }
 		inline uint32_t get_height() const { return m_height; }
 		inline uint32_t get_pitch() const { return m_pitch; }
-		inline uint32_t get_total_pixels() const { return m_width * m_height; }
+		inline uint64_t get_total_pixels() const { return (uint64_t)m_width * m_height; } // m_width is widened to 64 bits before the multiply, so the product is computed in 64 bits.
 
 		inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
 		inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
@@ -3315,7 +3538,7 @@ namespace basisu
 							{
 								if (!nan_msg)
 								{
-									fprintf(stderr, "One or more pixels was NaN, setting to 0.\n");
+									fprintf(stderr, "One or more input pixels was NaN, setting to 0.\n");
 									nan_msg = true;
 								}
 							}
@@ -3324,7 +3547,7 @@ namespace basisu
 							{
 								if (!inf_msg)
 								{
-									fprintf(stderr, "One or more pixels was INF, setting to 0.\n");
+									fprintf(stderr, "One or more input pixels was INF, setting to 0.\n");
 									inf_msg = true;
 								}
 							}
@@ -3333,7 +3556,7 @@ namespace basisu
 							{
 								if (!neg_zero_msg)
 								{
-									fprintf(stderr, "One or more pixels was -0, setting them to 0.\n");
+									fprintf(stderr, "One or more input pixels was -0, setting them to 0.\n");
 									neg_zero_msg = true;
 								}
 							}
@@ -3350,7 +3573,7 @@ namespace basisu
 
 								if (!neg_msg)
 								{
-									fprintf(stderr, "One or more pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");
+									fprintf(stderr, "One or more input pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");
 									neg_msg = true;
 								}
 								
@@ -3363,7 +3586,7 @@ namespace basisu
 								
 								if (!clamp_msg)
 								{
-									fprintf(stderr, "One or more pixels had to be clamped to %f.\n", highest_mag);
+									fprintf(stderr, "One or more input pixels had to be clamped to %f.\n", highest_mag);
 									clamp_msg = true;
 								}
 
@@ -3385,6 +3608,45 @@ namespace basisu
 
 			return *this;
 		}
+
+		bool has_alpha(uint32_t channel = 3) const // True if any pixel's component at index `channel` differs from exactly 1.0f.
+		{
+			for (uint32_t y = 0; y < m_height; ++y)
+				for (uint32_t x = 0; x < m_width; ++x)
+					if ((*this)(x, y)[channel] != 1.0f) // Exact float comparison; stops at the first mismatch.
+						return true;
+
+			return false;
+		}
+
+		vec4F get_filtered_vec4F(float x, float y) const // Bilinear sample of the float image at (x, y).
+		{
+			x -= .5f; // Shift by half a pixel so a sample taken at (.5, .5) lands exactly on texel (0, 0).
+			y -= .5f;
+
+			int ix = (int)floorf(x);
+			int iy = (int)floorf(y);
+			float wx = x - ix; // Fractional position within the 2x2 neighborhood.
+			float wy = y - iy;
+
+			vec4F a(get_clamped(ix, iy)); // The four neighbors are fetched through get_clamped().
+			vec4F b(get_clamped(ix + 1, iy));
+			vec4F c(get_clamped(ix, iy + 1));
+			vec4F d(get_clamped(ix + 1, iy + 1));
+
+			vec4F result;
+
+			for (uint32_t i = 0; i < 4; i++)
+			{
+				const float top = lerp<float>((float)a[i], (float)b[i], wx); // Horizontal blend of the upper pair.
+				const float bot = lerp<float>((float)c[i], (float)d[i], wx); // Horizontal blend of the lower pair.
+				const float m = lerp<float>((float)top, (float)bot, wy); // Vertical blend of the two rows.
+
+				result[i] = m;
+			}
+
+			return result;
+		}
 						
 	private:
 		uint32_t m_width, m_height, m_pitch;  // all in pixels
@@ -3402,6 +3664,52 @@ namespace basisu
 	float linear_to_srgb(float l);
 	float srgb_to_linear(float s);
 
+	class fast_linear_to_srgb // Table-driven conversion of a linear value in [0,1] to an 8-bit sRGB code.
+	{
+	public:
+		fast_linear_to_srgb() // Tables are built at construction.
+		{
+			init();
+		}
+
+		void init() // Fills both lookup tables.
+		{
+			for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i)
+			{
+				float l = (float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1)); // Evenly spaced linear values covering 0..1.
+				m_linear_to_srgb_table[i] = (uint8_t)basisu::fast_floorf_int(255.0f * basisu::linear_to_srgb(l)); // Floored; convert() may bump the result up by one.
+			}
+
+			float srgb_to_linear[256]; // Local array; the function of the same name is reached via the basisu:: qualifier.
+			for (int i = 0; i < 256; i++)
+				srgb_to_linear[i] = basisu::srgb_to_linear((float)i / 255.0f);
+
+			for (int i = 0; i < 256; i++)
+				m_srgb_to_linear_thresh[i] = (srgb_to_linear[i] + srgb_to_linear[basisu::minimum<int>(i + 1, 255)]) * .5f; // Linear-space midpoint between code i and code i+1 (code 255 pairs with itself).
+		}
+
+		inline uint8_t convert(float l) const // l must be within [0,1]; this is checked only by the assert.
+		{
+			assert((l >= 0.0f) && (l <= 1.0f));
+			int j = basisu::fast_roundf_int((LINEAR_TO_SRGB_TABLE_SIZE - 1) * l); // Nearest table slot.
+
+			assert((j >= 0) && (j < LINEAR_TO_SRGB_TABLE_SIZE));
+			int b = m_linear_to_srgb_table[j];
+
+			b += (l > m_srgb_to_linear_thresh[b]); // Step up one code when l lies above the midpoint to the next code.
+
+			return (uint8_t)b;
+		}
+
+	private:
+		static constexpr int LINEAR_TO_SRGB_TABLE_SIZE = 2048;
+		uint8_t m_linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE]; // Floored sRGB code for each quantized linear input.
+
+		float m_srgb_to_linear_thresh[256]; // Per-code threshold consulted by convert().
+	};
+
+	extern fast_linear_to_srgb g_fast_linear_to_srgb;
+		
 	// Image metrics
 		
 	class image_metrics
@@ -3438,6 +3746,8 @@ namespace basisu
 		void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);
 	};
 
+	void print_image_metrics(const image& a, const image& b);
+
 	// Image saving/loading/resampling
 
 	bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
@@ -3450,15 +3760,22 @@ namespace basisu
 	bool load_qoi(const char* pFilename, image& img);
 
 	bool load_jpg(const char *pFilename, image& img);
+	bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img);
 	inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }
 	
 	// Currently loads .PNG, .TGA, or .JPG
 	bool load_image(const char* pFilename, image& img);
 	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
 
+	bool is_image_filename_hdr(const char* pFilename);
+
 	// Supports .HDR and most (but not all) .EXR's (see TinyEXR).
-	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true);
-	inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true) { return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear); }
+	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);
+	
+	inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f) // std::string convenience overload.
+	{ 
+		return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias); // Forwards every argument unchanged to the const char* overload.
+	}
 
 	enum class hdr_image_type
 	{
@@ -3466,10 +3783,11 @@ namespace basisu
 		cHITRGBAFloat = 1,
 		cHITPNGImage = 2,
 		cHITEXRImage = 3,
-		cHITHDRImage = 4
+		cHITHDRImage = 4,
+		cHITJPGImage = 5
 	};
 
-	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear);
+	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);
 
 	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
 	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
@@ -3512,7 +3830,7 @@ namespace basisu
 	};
 
 	// Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images.
-	bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags);
+	bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags);
 			
 	enum
 	{
@@ -3572,102 +3890,6 @@ namespace basisu
 
 	inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); }
 
-	// 2D array
-
-	template<typename T>
-	class vector2D
-	{
-		typedef basisu::vector<T> TVec;
-
-		uint32_t m_width, m_height;
-		TVec m_values;
-
-	public:
-		vector2D() :
-			m_width(0),
-			m_height(0)
-		{
-		}
-
-		vector2D(uint32_t w, uint32_t h) :
-			m_width(0),
-			m_height(0)
-		{
-			resize(w, h);
-		}
-
-		vector2D(const vector2D &other)
-		{
-			*this = other;
-		}
-
-		vector2D &operator= (const vector2D &other)
-		{
-			if (this != &other)
-			{
-				m_width = other.m_width;
-				m_height = other.m_height;
-				m_values = other.m_values;
-			}
-			return *this;
-		}
-
-		inline bool operator== (const vector2D &rhs) const
-		{
-			return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values);
-		}
-
-		inline uint32_t size_in_bytes() const { return (uint32_t)m_values.size() * sizeof(m_values[0]); }
-
-		inline const T &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
-		inline T &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
-
-		inline const T &operator[] (uint32_t i) const { return m_values[i]; }
-		inline T &operator[] (uint32_t i) { return m_values[i]; }
-				
-		inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }		
-		inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
-
-		void clear()
-		{
-			m_width = 0;
-			m_height = 0;
-			m_values.clear();
-		}
-
-		void set_all(const T&val)
-		{
-			vector_set_all(m_values, val);
-		}
-
-		inline const T* get_ptr() const { return &m_values[0]; }
-		inline T* get_ptr() { return &m_values[0]; }
-
-		vector2D &resize(uint32_t new_width, uint32_t new_height)
-		{
-			if ((m_width == new_width) && (m_height == new_height))
-				return *this;
-
-			TVec oldVals(new_width * new_height);
-			oldVals.swap(m_values);
-
-			const uint32_t w = minimum(m_width, new_width);
-			const uint32_t h = minimum(m_height, new_height);
-
-			if ((w) && (h))
-			{
-				for (uint32_t y = 0; y < h; y++)
-					for (uint32_t x = 0; x < w; x++)
-						m_values[x + y * new_width] = oldVals[x + y * m_width];
-			}
-
-			m_width = new_width;
-			m_height = new_height;
-
-			return *this;
-		}
-	};
-
 	inline FILE *fopen_safe(const char *pFilename, const char *pMode)
 	{
 #ifdef _WIN32
@@ -3723,12 +3945,14 @@ namespace basisu
 	};
 	typedef basisu::vector<pixel_block_hdr> pixel_block_hdr_vec;
 
-	void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure);
+	void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure, bool add_noise = false, bool per_component = true, bool luma_scaling = false);
 	bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img);
+	bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img);
 	
 	// Intersection
 	enum eClear { cClear = 0 };
 	enum eInitExpand { cInitExpand = 0 };
+	enum eIdentity { cIdentity = 0 };
 
 	template<typename vector_type>
 	class ray
@@ -3845,6 +4069,7 @@ namespace basisu
 	typedef vec_interval<vec3F> vec_interval3F;
 	typedef vec_interval<vec4F> vec_interval4F;
 
+	typedef vec_interval1F aabb1F;
 	typedef vec_interval2F aabb2F;
 	typedef vec_interval3F aabb3F;
 
@@ -4004,18 +4229,19 @@ namespace basisu
 		return result;
 	}
 
+	// Overlays a 32-bit unsigned integer and a float in the same storage, so the
+	// half/float conversion helpers in this header can access a float's raw bit pattern.
+	union fu32
+	{
+		uint32_t u; // raw 32-bit pattern
+		float f; // the same storage viewed as a float
+	};
+
 	// Supports positive and denormals only. No NaN or Inf.
-	inline float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)
+	BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)
 	{
 		assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h));
-
-		union fu32
-		{
-			uint32_t u;
-			float f;
-		};
-
-		static const fu32 K = { 0x77800000 };
+				
+		// add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127)
+		static const fu32 K = { 0x77800000 }; 
 
 		fu32 o;
 		o.u = h << 13;
@@ -4023,7 +4249,62 @@ namespace basisu
 
 		return o.f;
 	}
+
+	// Positive, negative, or denormals. No NaN or Inf. Clamped to MAX_HALF_FLOAT.
+	inline basist::half_float fast_float_to_half_trunc_no_nan_or_inf(float f)
+	{
+		assert(!isnan(f) && !isinf(f));
+
+		// Subtract 112 from the exponent, to change the bias from 127 to 15.
+		static const fu32 g_f_to_h{ 0x7800000 };
 				
-} // namespace basisu
+		fu32 fu;
+
+		fu.f = minimum<float>((float)basist::MAX_HALF_FLOAT, fabsf(f)) * g_f_to_h.f;
 
+		return (basist::half_float)(((fu.u >> (23 - 10)) & 0x7FFF) | ((f < 0.0f) ? 0x8000 : 0));
+	}
+
+	// Converts a float to a half float by truncation (the low mantissa bits are dropped, no rounding).
+	// The asserts require f to be finite and within [0, basist::MAX_HALF_FLOAT]; this variant does no
+	// clamping and never sets the sign bit.
+	inline basist::half_float fast_float_to_half_trunc_no_clamp_neg_nan_or_inf(float f)
+	{
+		assert(!isnan(f) && !isinf(f));
+		assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));
+		
+		// Subtract 112 from the exponent, to change the bias from 127 to 15.
+		static const fu32 g_f_to_h{ 0x7800000 };
+
+		fu32 fu;
+
+		// Multiplying by the constant above performs the exponent re-bias in floating point.
+		fu.f = f * g_f_to_h.f;
+		
+		// Shift out the low 13 mantissa bits and keep the remaining low 15 bits; the mask removes the sign bit.
+		return (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);
+	}
+		
+	// Converts a float to a half float with rounding (contrast with the truncating variants above).
+	// The asserts require f to be finite and within [0, basist::MAX_HALF_FLOAT]; the sign bit is never set.
+	// The rounded result is limited to basist::MAX_HALF_FLOAT_AS_INT_BITS.
+	inline basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f)
+	{
+		assert(!isnan(f) && !isinf(f));
+		assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));
+
+		// Subtract 112 from the exponent, to change the bias from 127 to 15.
+		static const fu32 g_f_to_h{ 0x7800000 };
+
+		fu32 fu;
+
+		// Multiplying by the constant above performs the exponent re-bias in floating point.
+		fu.f = f * g_f_to_h.f;
+
+		// Truncated result: drop the low 13 mantissa bits and mask off the sign bit.
+		uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);
+
+		// Round to nearest; on an exact tie (dropped bits == 4096) round up only when h is odd (ties to even).
+		uint32_t mant = fu.u & 8191; // examine lowest 13 bits
+		uint32_t inc = (mant > 4096) | ((mant == 4096) & (h & 1));
+		h += inc;
+
+		// The increment can carry h past the largest allowed bit pattern, so limit it.
+		if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS)
+			h = basist::MAX_HALF_FLOAT_AS_INT_BITS;
+
+		return (basist::half_float)h;
+	}
+								
+} // namespace basisu
 
+#include "basisu_math.h"

+ 16 - 94
thirdparty/basis_universal/encoder/basisu_frontend.cpp

@@ -353,9 +353,7 @@ namespace basisu
 			const uint32_t first_index = block_index_iter;
 			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->add_job([this, first_index, last_index] {
-#endif
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 				{
@@ -388,15 +386,11 @@ namespace basisu
 					m_block_selector_cluster_index[block_index] = best_index;
 				}
 
-#ifndef __EMSCRIPTEN__
 				});
-#endif
 
 		}
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		m_encoded_blocks.resize(m_total_blocks);
 		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
@@ -425,9 +419,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index, pass] {
-#endif
 										
 					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 					{
@@ -475,15 +467,11 @@ namespace basisu
 
 					} // block_index
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			}
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 
 			m_endpoint_clusters.resize(0);
 			m_endpoint_clusters.resize(endpoints.size());
@@ -501,9 +489,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index] {
-#endif
 
 					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 					{
@@ -535,15 +521,11 @@ namespace basisu
 						m_block_selector_cluster_index[block_index] = best_index;
 					}
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			}
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 
 			m_encoded_blocks.resize(m_total_blocks);
 			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
@@ -573,7 +555,7 @@ namespace basisu
 		debug_printf("introduce_special_selector_clusters\n");
 
 		uint32_t total_blocks_relocated = 0;
-		const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
+		const uint32_t initial_selector_clusters = m_selector_cluster_block_indices.size_u32();
 
 		bool_vec block_relocated_flags(m_total_blocks);
 
@@ -595,7 +577,7 @@ namespace basisu
 
 			debug_printf("Introducing sel %u\n", sel);
 
-			const uint32_t new_selector_cluster_index = (uint32_t)m_optimized_cluster_selectors.size();
+			const uint32_t new_selector_cluster_index = m_optimized_cluster_selectors.size_u32();
 
 			m_optimized_cluster_selectors.push_back(blk);
 			
@@ -675,7 +657,7 @@ namespace basisu
 	{
 		debug_printf("optimize_selector_codebook\n");
 
-		const uint32_t orig_total_selector_clusters = (uint32_t)m_optimized_cluster_selectors.size();
+		const uint32_t orig_total_selector_clusters = m_optimized_cluster_selectors.size_u32();
 
 		bool_vec selector_cluster_was_used(m_optimized_cluster_selectors.size());
 		for (uint32_t i = 0; i < m_total_blocks; i++)
@@ -787,9 +769,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index] {
-#endif
 
 					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 					{
@@ -830,15 +810,11 @@ namespace basisu
 								blk.set_selector(x, y, selectors[x + y * 4]);
 					}
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			}
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		} // use_cpu
 		 
@@ -859,9 +835,7 @@ namespace basisu
 			const uint32_t first_index = block_index_iter;
 			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {
-#endif
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 				{			
@@ -883,15 +857,11 @@ namespace basisu
 
 				} // block_index;
 
-#ifndef __EMSCRIPTEN__
 			} );
-#endif
 
 		} // block_index_iter
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 	}
 
 	void basisu_frontend::generate_endpoint_clusters()
@@ -970,7 +940,7 @@ namespace basisu
 		}
 								
 		if (m_params.m_debug_stats)
-			debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", (uint32_t)m_endpoint_clusters.size(), (uint32_t)m_endpoint_parent_clusters.size());
+			debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", m_endpoint_clusters.size_u32(), m_endpoint_parent_clusters.size_u32());
 	}
 
 	// Iterate through each array of endpoint cluster block indices and set the m_block_endpoint_clusters_indices[][] array to indicaste which cluster index each block uses.
@@ -1040,11 +1010,9 @@ namespace basisu
 		for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
 		{
 			const uint32_t first_index = cluster_index_iter;                                    
-			const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);   
+			const uint32_t last_index = minimum<uint32_t>(m_endpoint_clusters.size_u32(), cluster_index_iter + N);   
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
-#endif
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
@@ -1112,15 +1080,11 @@ namespace basisu
 					}
 				} // cluster_index
 
-#ifndef __EMSCRIPTEN__
 			} );
-#endif
 
 		} // cluster_index_iter
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		vector_sort(m_subblock_endpoint_quant_err_vec);
 	}
@@ -1131,19 +1095,19 @@ namespace basisu
 
 		generate_block_endpoint_clusters();
 
-		int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - (uint32_t)m_endpoint_clusters.size();
+		int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - m_endpoint_clusters.size_u32();
 		if (num_new_endpoint_clusters <= 0)
 			return;
 
 		compute_endpoint_subblock_error_vec();
 
-		const uint32_t num_orig_endpoint_clusters = (uint32_t)m_endpoint_clusters.size();
+		const uint32_t num_orig_endpoint_clusters = m_endpoint_clusters.size_u32();
 
 		std::unordered_set<uint32_t> training_vector_was_relocated;
 
 		uint_vec cluster_sizes(num_orig_endpoint_clusters);
 		for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)
-			cluster_sizes[i] = (uint32_t)m_endpoint_clusters[i].size();
+			cluster_sizes[i] = m_endpoint_clusters[i].size_u32();
 
 		std::unordered_set<uint32_t> ignore_cluster;
 
@@ -1259,7 +1223,7 @@ namespace basisu
 		// TODO: Get this working when step>0
 		if (m_params.m_pOpenCL_context && !step)
 		{
-			const uint32_t total_clusters = m_endpoint_clusters.size();
+			const uint32_t total_clusters = (uint32_t)m_endpoint_clusters.size();
 
 			basisu::vector<cl_pixel_cluster> pixel_clusters(total_clusters);
 			
@@ -1342,7 +1306,7 @@ namespace basisu
 
 				} // cluster_indices_iter
 
-				uint32_t* pSorted = radix_sort(colors.size(), colors.data(), colors2.data(), 0, 3);
+				uint32_t* pSorted = radix_sort((uint32_t)colors.size(), colors.data(), colors2.data(), 0, 3);
 
 				const uint64_t first_pixel_index = input_pixels.size();
 
@@ -1522,9 +1486,7 @@ namespace basisu
 				const uint32_t first_index = cluster_index_iter;
 				const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index, step] {
-#endif
 
 					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
@@ -1643,15 +1605,11 @@ namespace basisu
 
 					} // cluster_index
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			} // cluster_index_iter
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 		}
 
 		debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
@@ -1726,14 +1684,14 @@ namespace basisu
 		{
 			// For the OpenCL kernel, we order the parent endpoint clusters by smallest to largest for efficiency.
 			// We also prepare an array of block info structs that point into this new parent endpoint cluster array.
-			const uint32_t total_parent_clusters = m_endpoint_clusters_within_each_parent_cluster.size();
+			const uint32_t total_parent_clusters = (uint32_t)m_endpoint_clusters_within_each_parent_cluster.size();
 
 			basisu::vector<cl_block_info_struct> cl_block_info_structs(m_total_blocks);
 			
 			// the size of each parent cluster, in total clusters
 			uint_vec parent_cluster_sizes(total_parent_clusters);
 			for (uint32_t i = 0; i < total_parent_clusters; i++)
-				parent_cluster_sizes[i] = m_endpoint_clusters_within_each_parent_cluster[i].size();
+				parent_cluster_sizes[i] = (uint32_t)m_endpoint_clusters_within_each_parent_cluster[i].size();
 
 			uint_vec first_parent_cluster_ofs(total_parent_clusters);
 			uint32_t cur_ofs = 0;
@@ -1818,9 +1776,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index, &best_cluster_indices, &block_clusters] {
-#endif
 
 					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 					{
@@ -1951,15 +1907,11 @@ namespace basisu
 
 					} // block_index
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			} // block_index_iter
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 		
 		} // use_cpu
 						
@@ -2104,9 +2056,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job([this, first_index, last_index] {
-#endif
 
 					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 					{
@@ -2131,15 +2081,11 @@ namespace basisu
 
 					} // block_index
 
-#ifndef __EMSCRIPTEN__
 					});
-#endif
 
 			} // block_index_iter
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		} // use_cpu
 				
@@ -2204,9 +2150,7 @@ namespace basisu
 			const uint32_t first_index = block_index_iter;
 			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {
-#endif
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 				{
@@ -2233,15 +2177,11 @@ namespace basisu
 				
 				} // block_index
 
-#ifndef __EMSCRIPTEN__
 			} );
-#endif
 
 		} // block_index_iter
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		vec16F_clusterizer selector_clusterizer;
 		for (uint32_t i = 0; i < m_total_blocks; i++)
@@ -2335,9 +2275,7 @@ namespace basisu
 			const uint32_t first_index = cluster_index_iter;
 			const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);
 
-#ifndef __EMSCRIPTEN__			
 			m_params.m_pJob_pool->add_job([this, first_index, last_index] {
-#endif
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
@@ -2406,15 +2344,11 @@ namespace basisu
 
 				} // cluster_index
 
-#ifndef __EMSCRIPTEN__
 				});
-#endif
 
 		} // cluster_index_iter
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 
 		debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
 				
@@ -2506,7 +2440,7 @@ namespace basisu
 
 		if ((m_params.m_pOpenCL_context) && m_use_hierarchical_selector_codebooks)
 		{
-			const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size();
+			const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size_u32();
 
 			basisu::vector<fosc_selector_struct> selector_structs;
 			selector_structs.reserve(m_optimized_cluster_selectors.size());
@@ -2534,7 +2468,7 @@ namespace basisu
 					selector_cluster_indices.push_back(selector_cluster_index);
 				}
 
-				cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size();
+				cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size_u32();
 			}
 
 			const uint32_t total_input_selectors = cur_ofs;
@@ -2549,7 +2483,7 @@ namespace basisu
 
 				block_structs[i].m_etc_color5_inten.a = (uint8_t)blk.get_inten_table(0);
 				block_structs[i].m_first_selector = parent_selector_cluster_offsets[parent_selector_cluster];
-				block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size();
+				block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size_u32();
 			}
 
 			uint_vec output_selector_cluster_indices(m_total_blocks);
@@ -2615,10 +2549,8 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-	#ifndef __EMSCRIPTEN__
 				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &unpacked_optimized_cluster_selectors] {
-	#endif
-
+	
 				int prev_best_cluster_index = 0;
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
@@ -2756,15 +2688,11 @@ namespace basisu
 					
 				} // block_index
 
-	#ifndef __EMSCRIPTEN__
 				} );
-	#endif
 
 			} // block_index_iter
 
-	#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
-	#endif
 						
 			for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
 			{
@@ -3081,9 +3009,7 @@ namespace basisu
 			const uint32_t first_index = cluster_index_iter;                                    
 			const uint32_t last_index = minimum<uint32_t>((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N);   
 
-#ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] {
-#endif
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
@@ -3173,15 +3099,11 @@ namespace basisu
 
 				} // cluster_index
 
-#ifndef __EMSCRIPTEN__
 			} );
-#endif
 
 		} // cluster_index_iter
 
-#ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
-#endif
 				
 		uint32_t total_unused_clusters = 0;
 		uint32_t total_improved_clusters = 0;

+ 74 - 5
thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp

@@ -19,6 +19,8 @@
 #include "basisu_bc7enc.h"
 #include "../transcoder/basisu_astc_hdr_core.h"
 
+#define BASISU_USE_GOOGLE_ASTC_DECODER (1)
+
 namespace basisu
 {
 	//------------------------------------------------------------------------------------------------
@@ -1421,6 +1423,7 @@ namespace basisu
 		case texture_format::cBC6HUnsigned:
 		case texture_format::cASTC_HDR_4x4:
 		case texture_format::cUASTC_HDR_4x4:
+		case texture_format::cASTC_HDR_6x6:
 		{
 			// Can't unpack HDR blocks in unpack_block() because it returns 32bpp pixel data.
 			assert(0);
@@ -1487,15 +1490,44 @@ namespace basisu
 	{
 		switch (fmt)
 		{
+			case texture_format::cASTC_HDR_6x6:
+			{
+#if BASISU_USE_GOOGLE_ASTC_DECODER
+				bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 6, 6);
+				assert(status);
+				if (!status)
+					return false;
+#else
+				// Use our decoder
+				basist::half_float half_block[6 * 6][4];
+
+				astc_helpers::log_astc_block log_blk;
+				if (!astc_helpers::unpack_block(pBlock, log_blk, 6, 6))
+					return false;
+				if (!astc_helpers::decode_block(log_blk, half_block, 6, 6, astc_helpers::cDecodeModeHDR16))
+					return false;
+
+				for (uint32_t p = 0; p < (6 * 6); p++)
+				{
+					pPixels[p][0] = basist::half_to_float(half_block[p][0]);
+					pPixels[p][1] = basist::half_to_float(half_block[p][1]);
+					pPixels[p][2] = basist::half_to_float(half_block[p][2]);
+					pPixels[p][3] = basist::half_to_float(half_block[p][3]);
+				}
+#endif
+				return true;
+			}
 			case texture_format::cASTC_HDR_4x4:
 			case texture_format::cUASTC_HDR_4x4:
 			{
-#if 1
+#if BASISU_USE_GOOGLE_ASTC_DECODER
+				// Use Google's decoder
 				bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 4, 4);
 				assert(status);
 				if (!status)
 					return false;
 #else
+				// Use our decoder
 				basist::half_float half_block[16][4];
 				
 				astc_helpers::log_astc_block log_blk;
@@ -1592,10 +1624,8 @@ namespace basisu
 
 	bool gpu_image::unpack_hdr(imagef& img) const
 	{
-		if ((m_fmt != texture_format::cASTC_HDR_4x4) && 
-			(m_fmt != texture_format::cUASTC_HDR_4x4) &&
-			(m_fmt != texture_format::cBC6HUnsigned) &&
-			(m_fmt != texture_format::cBC6HSigned))
+		if ((m_fmt != texture_format::cASTC_HDR_4x4) && (m_fmt != texture_format::cUASTC_HDR_4x4) && (m_fmt != texture_format::cASTC_HDR_6x6) &&
+			(m_fmt != texture_format::cBC6HUnsigned) &&	(m_fmt != texture_format::cBC6HSigned))
 		{
 			// Can't call on LDR images, at least currently. (Could unpack the LDR data and convert to float.)
 			assert(0);
@@ -1643,6 +1673,7 @@ namespace basisu
 		KTX_RG = 0x8227,
 		KTX_RGB = 0x1907,
 		KTX_RGBA = 0x1908,
+
 		KTX_COMPRESSED_RGB_S3TC_DXT1_EXT = 0x83F0,
 		KTX_COMPRESSED_RGBA_S3TC_DXT5_EXT = 0x83F3,
 		KTX_COMPRESSED_RED_RGTC1_EXT = 0x8DBB,
@@ -1655,11 +1686,42 @@ namespace basisu
 		KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT = 0x8E8F,
 		KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG = 0x8C00,
 		KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02,
+		
 		KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0,
+		KTX_COMPRESSED_RGBA_ASTC_5x4_KHR = 0x93B1,
+		KTX_COMPRESSED_RGBA_ASTC_5x5_KHR = 0x93B2,
+		KTX_COMPRESSED_RGBA_ASTC_6x5_KHR = 0x93B3,
+		KTX_COMPRESSED_RGBA_ASTC_6x6_KHR = 0x93B4,
+		KTX_COMPRESSED_RGBA_ASTC_8x5_KHR = 0x93B5,
+		KTX_COMPRESSED_RGBA_ASTC_8x6_KHR = 0x93B6,
+		KTX_COMPRESSED_RGBA_ASTC_8x8_KHR = 0x93B7,
+		KTX_COMPRESSED_RGBA_ASTC_10x5_KHR = 0x93B8,
+		KTX_COMPRESSED_RGBA_ASTC_10x6_KHR = 0x93B9,
+		KTX_COMPRESSED_RGBA_ASTC_10x8_KHR = 0x93BA,
+		KTX_COMPRESSED_RGBA_ASTC_10x10_KHR = 0x93BB,
+		KTX_COMPRESSED_RGBA_ASTC_12x10_KHR = 0x93BC,
+		KTX_COMPRESSED_RGBA_ASTC_12x12_KHR = 0x93BD,
+
 		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR = 0x93D0,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR = 0x93D1,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR = 0x93D2,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR = 0x93D3,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR = 0x93D4,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR = 0x93D5,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR = 0x93D6,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR = 0x93D7,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR = 0x93D8,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR = 0x93D9,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR = 0x93DA,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR = 0x93DB,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR = 0x93DC,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR = 0x93DD,
+
 		KTX_COMPRESSED_RGBA_UASTC_4x4_KHR = 0x94CC, // TODO - Use proper value!
+
 		KTX_ATC_RGB_AMD = 0x8C92,
 		KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD = 0x87EE,
+
 		KTX_COMPRESSED_RGB_FXT1_3DFX = 0x86B0,
 		KTX_COMPRESSED_RGBA_FXT1_3DFX = 0x86B1,
 		KTX_COMPRESSED_RGBA_PVRTC_4BPPV2_IMG = 0x9138,
@@ -1836,6 +1898,13 @@ namespace basisu
 			base_internal_fmt = KTX_RGBA;
 			break;
 		}
+		case texture_format::cASTC_HDR_6x6:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_ASTC_6x6_KHR;
+			// TODO: should we write RGB? We don't support generating HDR 6x6 with alpha.
+			base_internal_fmt = KTX_RGBA; 
+			break;
+		}
 		// We use different enums for HDR vs. LDR ASTC, but internally they are both just ASTC.
 		case texture_format::cASTC_LDR_4x4:
 		case texture_format::cASTC_HDR_4x4:

+ 3146 - 0
thirdparty/basis_universal/encoder/basisu_math.h

@@ -0,0 +1,3146 @@
+// File: basisu_math.h
+#pragma once
+
+// TODO: Would prefer this in the basisu namespace, but to avoid collisions with the existing vec/matrix classes I'm placing this in "bu_math".
+namespace bu_math
+{
+	// Cross-platform 1.0f/sqrtf(x) approximation. See https://en.wikipedia.org/wiki/Fast_inverse_square_root#cite_note-37.
+	// Would prefer using SSE1 etc. but that would require implementing multiple versions and platform divergence (needing more testing).
+	BASISU_FORCE_INLINE float inv_sqrt(float v)
+	{
+		// Lets the float's storage be read and written as a 32-bit unsigned integer.
+		union 
+		{ 
+			float flt; 
+			uint32_t ui; 
+		} un;
+
+		un.flt = v;
+		// Initial estimate: magic constant minus half of v's raw bit pattern.
+		un.ui = 0x5F1FFFF9UL - (un.ui >> 1);
+
+		// Refine the estimate; NOTE(review): looks like a single Newton-style step with tuned constants - confirm against the reference linked above this function.
+		return 0.703952253f * un.flt * (2.38924456f - v * (un.flt * un.flt));
+	}
+
+	// Cubic interpolation between 0 and 1 as x moves from edge0 to edge1.
+	// The edges must differ (asserted); the normalized position is passed through basisu::saturate() first.
+	inline float smoothstep(float edge0, float edge1, float x)
+	{
+		assert(edge1 != edge0);
+
+		// Normalized position of x between the two edges, saturated.
+		const float t = basisu::saturate((x - edge0) / (edge1 - edge0));
+
+		return t * t * (3.0f - 2.0f * t);
+	}
+
+	template <uint32_t N, typename T>
+	class vec : public basisu::rel_ops<vec<N, T> >
+	{
+	public:
+		typedef T scalar_type;
+		enum
+		{
+			num_elements = N
+		};
+
+		inline vec()
+		{
+		}
+
+		inline vec(basisu::eClear)
+		{
+			clear();
+		}
+
+		inline vec(const vec& other)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_s[i] = other.m_s[i];
+		}
+
+		template <uint32_t O, typename U>
+		inline vec(const vec<O, U>& other)
+		{
+			set(other);
+		}
+
+		template <uint32_t O, typename U>
+		inline vec(const vec<O, U>& other, T w)
+		{
+			*this = other;
+			m_s[N - 1] = w;
+		}
+
+		template <typename... Args>
+		inline explicit vec(Args... args)
+		{
+			static_assert(sizeof...(args) <= N);
+			set(args...);
+		}
+
+		// Sets every component to zero.
+		inline void clear()
+		{
+			// Short vectors are zeroed element by element; longer ones with a single memset.
+			if (N <= 4)
+			{
+				for (uint32_t idx = 0; idx < N; ++idx)
+					m_s[idx] = 0;
+				return;
+			}
+
+			memset(m_s, 0, sizeof(m_s));
+		}
+
+		// Copies components from a vector of possibly different size/type.
+		// Components present in both are converted with static_cast<T>; any extra components of this vector are zeroed.
+		template <uint32_t ON, typename OT>
+		inline vec& set(const vec<ON, OT>& other)
+		{
+			// Nothing to do when asked to copy from ourselves.
+			if ((void*)this != (void*)&other)
+			{
+				const uint32_t num_shared = basisu::minimum(N, ON);
+
+				for (uint32_t c = 0; c < num_shared; ++c)
+					m_s[c] = static_cast<T>(other[c]);
+
+				for (uint32_t c = num_shared; c < N; ++c)
+					m_s[c] = 0;
+			}
+
+			return *this;
+		}
+
+		inline vec& set_component(uint32_t index, T val)
+		{
+			assert(index < N);
+			m_s[index] = val;
+			return *this;
+		}
+
+		inline vec& set_all(T val)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_s[i] = val;
+			return *this;
+		}
+
+		template <typename... Args>
+		inline vec& set(Args... args)
+		{
+			static_assert(sizeof...(args) <= N);
+
+			// Initialize using parameter pack expansion
+			T values[] = { static_cast<T>(args)... };
+
+			// Special case if setting with a scalar
+			if (sizeof...(args) == 1)
+			{
+				set_all(values[0]);
+			}
+			else
+			{
+				// Copy the values into the vector
+				for (std::size_t i = 0; i < sizeof...(args); ++i)
+				{
+					m_s[i] = values[i];
+				}
+
+				// Zero-initialize the remaining elements (if any)
+				if (sizeof...(args) < N)
+				{
+					std::fill(m_s + sizeof...(args), m_s + N, T{});
+				}
+			}
+
+			return *this;
+		}
+
+		inline vec& set(const T* pValues)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_s[i] = pValues[i];
+			return *this;
+		}
+
+		template <uint32_t ON, typename OT>
+		inline vec& swizzle_set(const vec<ON, OT>& other, uint32_t i)
+		{
+			return set(static_cast<T>(other[i]));
+		}
+
+		template <uint32_t ON, typename OT>
+		inline vec& swizzle_set(const vec<ON, OT>& other, uint32_t i, uint32_t j)
+		{
+			return set(static_cast<T>(other[i]), static_cast<T>(other[j]));
+		}
+
+		template <uint32_t ON, typename OT>
+		inline vec& swizzle_set(const vec<ON, OT>& other, uint32_t i, uint32_t j, uint32_t k)
+		{
+			return set(static_cast<T>(other[i]), static_cast<T>(other[j]), static_cast<T>(other[k]));
+		}
+
+		template <uint32_t ON, typename OT>
+		inline vec& swizzle_set(const vec<ON, OT>& other, uint32_t i, uint32_t j, uint32_t k, uint32_t l)
+		{
+			return set(static_cast<T>(other[i]), static_cast<T>(other[j]), static_cast<T>(other[k]), static_cast<T>(other[l]));
+		}
+
+		inline vec& operator=(const vec& rhs)
+		{
+			if (this != &rhs)
+			{
+				for (uint32_t i = 0; i < N; i++)
+					m_s[i] = rhs.m_s[i];
+			}
+			return *this;
+		}
+
+		template <uint32_t O, typename U>
+		inline vec& operator=(const vec<O, U>& other)
+		{
+			if ((void*)this == (void*)&other)
+				return *this;
+
+			uint32_t s = basisu::minimum(N, O);
+
+			uint32_t i;
+			for (i = 0; i < s; i++)
+				m_s[i] = static_cast<T>(other[i]);
+
+			for (; i < N; i++)
+				m_s[i] = 0;
+
+			return *this;
+		}
+
+		// Component-wise equality: true only when every component compares equal.
+		inline bool operator==(const vec& rhs) const
+		{
+			// Advance while components match; stopping early means a mismatch was found.
+			uint32_t idx = 0;
+			while ((idx < N) && (m_s[idx] == rhs.m_s[idx]))
+				++idx;
+
+			return idx == N;
+		}
+
+		inline bool operator<(const vec& rhs) const
+		{
+			for (uint32_t i = 0; i < N; i++)
+			{
+				if (m_s[i] < rhs.m_s[i])
+					return true;
+				else if (!(m_s[i] == rhs.m_s[i]))
+					return false;
+			}
+
+			return false;
+		}
+
+		inline T operator[](uint32_t i) const
+		{
+			assert(i < N);
+			return m_s[i];
+		}
+
+		inline T& operator[](uint32_t i)
+		{
+			assert(i < N);
+			return m_s[i];
+		}
+
+		// Returns the raw bit pattern of component `index`, widened to 64 bits.
+		// Only component types that are 16, 32 or 64 bits wide are accepted (checked at compile time).
+		template <uint32_t index>
+		inline uint64_t get_component_bits_as_uint() const
+		{
+			static_assert(index < N);
+			static_assert((sizeof(T) == sizeof(uint16_t)) || (sizeof(T) == sizeof(uint32_t)) || (sizeof(T) == sizeof(uint64_t)), "Unsupported type");
+
+			// The bytes are copied with memcpy instead of dereferencing a reinterpret_cast'ed pointer:
+			// reading e.g. a float object through a uint32_t lvalue violates the strict aliasing rules.
+			if constexpr (sizeof(T) == sizeof(uint16_t))
+			{
+				uint16_t bits;
+				memcpy(&bits, &m_s[index], sizeof(bits));
+				return bits;
+			}
+			else if constexpr (sizeof(T) == sizeof(uint32_t))
+			{
+				uint32_t bits;
+				memcpy(&bits, &m_s[index], sizeof(bits));
+				return bits;
+			}
+			else
+			{
+				// The static_assert above guarantees the only remaining size is 64 bits.
+				uint64_t bits;
+				memcpy(&bits, &m_s[index], sizeof(bits));
+				return bits;
+			}
+		}
+
+		inline T get_x(void) const
+		{
+			return m_s[0];
+		}
+		inline T get_y(void) const
+		{
+			static_assert(N >= 2);
+			return m_s[1];
+		}
+		inline T get_z(void) const
+		{
+			static_assert(N >= 3);
+			return m_s[2];
+		}
+		inline T get_w(void) const
+		{
+			static_assert(N >= 4);
+			return m_s[3];
+		}
+
+		inline vec get_x_vector() const
+		{
+			return broadcast<0>();
+		}
+		inline vec get_y_vector() const
+		{
+			return broadcast<1>();
+		}
+		inline vec get_z_vector() const
+		{
+			return broadcast<2>();
+		}
+		inline vec get_w_vector() const
+		{
+			return broadcast<3>();
+		}
+
+		inline T get_component(uint32_t i) const
+		{
+			return (*this)[i];
+		}
+
+		inline vec& set_x(T v)
+		{
+			m_s[0] = v;
+			return *this;
+		}
+		inline vec& set_y(T v)
+		{
+			static_assert(N >= 2);
+			m_s[1] = v;
+			return *this;
+		}
+		inline vec& set_z(T v)
+		{
+			static_assert(N >= 3);
+			m_s[2] = v;
+			return *this;
+		}
+		inline vec& set_w(T v)
+		{
+			static_assert(N >= 4);
+			m_s[3] = v;
+			return *this;
+		}
+
+		inline const T* get_ptr() const
+		{
+			return reinterpret_cast<const T*>(&m_s[0]);
+		}
+		inline T* get_ptr()
+		{
+			return reinterpret_cast<T*>(&m_s[0]);
+		}
+
+		inline vec as_point() const
+		{
+			vec result(*this);
+			result[N - 1] = 1;
+			return result;
+		}
+
+		inline vec as_dir() const
+		{
+			vec result(*this);
+			result[N - 1] = 0;
+			return result;
+		}
+
+		inline vec<2, T> select2(uint32_t i, uint32_t j) const
+		{
+			assert((i < N) && (j < N));
+			return vec<2, T>(m_s[i], m_s[j]);
+		}
+
+		inline vec<3, T> select3(uint32_t i, uint32_t j, uint32_t k) const
+		{
+			assert((i < N) && (j < N) && (k < N));
+			return vec<3, T>(m_s[i], m_s[j], m_s[k]);
+		}
+
+		inline vec<4, T> select4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const
+		{
+			assert((i < N) && (j < N) && (k < N) && (l < N));
+			return vec<4, T>(m_s[i], m_s[j], m_s[k], m_s[l]);
+		}
+
+		inline bool is_dir() const
+		{
+			return m_s[N - 1] == 0;
+		}
+		inline bool is_vector() const
+		{
+			return is_dir();
+		}
+		inline bool is_point() const
+		{
+			return m_s[N - 1] == 1;
+		}
+
+		// Returns a copy divided by its last component; the copy is returned unchanged when that component is zero.
+		inline vec project() const
+		{
+			vec projected(*this);
+
+			const T last = projected[N - 1];
+			if (last)
+				projected /= last;
+
+			return projected;
+		}
+
+		// Constructs a vec from the single scalar stored in component i (runtime index).
+		// NOTE(review): presumably the single-scalar vec constructor replicates the value into every component - confirm.
+		inline vec broadcast(unsigned i) const
+		{
+			return vec((*this)[i]);
+		}
+
+		// Same as broadcast(unsigned), with the component index supplied as a template argument.
+		template <uint32_t i>
+		inline vec broadcast() const
+		{
+			return vec((*this)[i]);
+		}
+
+		// Swizzles: construct a vec (same N as this one) from the listed components, in the given order.
+		// The 2/3/4-argument forms rely on the matching 2/3/4-scalar vec constructors.
+		inline vec swizzle(uint32_t i, uint32_t j) const
+		{
+			return vec((*this)[i], (*this)[j]);
+		}
+
+		inline vec swizzle(uint32_t i, uint32_t j, uint32_t k) const
+		{
+			return vec((*this)[i], (*this)[j], (*this)[k]);
+		}
+
+		inline vec swizzle(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const
+		{
+			return vec((*this)[i], (*this)[j], (*this)[k], (*this)[l]);
+		}
+
+		// Unary minus: component-wise negation, returned as a new vector.
+		inline vec operator-() const
+		{
+			vec negated;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				negated.m_s[c] = -m_s[c];
+			}
+			return negated;
+		}
+
+		// Unary plus: returns an unmodified copy.
+		inline vec operator+() const
+		{
+			return *this;
+		}
+
+		// In-place component-wise addition of another vector.
+		inline vec& operator+=(const vec& other)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] += other.m_s[c];
+			}
+			return *this;
+		}
+
+		// In-place component-wise subtraction of another vector.
+		inline vec& operator-=(const vec& other)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] -= other.m_s[c];
+			}
+			return *this;
+		}
+
+		// In-place component-wise multiplication by another vector.
+		inline vec& operator*=(const vec& other)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] *= other.m_s[c];
+			}
+			return *this;
+		}
+
+		// In-place component-wise division by another vector (no zero check).
+		inline vec& operator/=(const vec& other)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] /= other.m_s[c];
+			}
+			return *this;
+		}
+
+		// In-place multiplication of every component by the scalar s.
+		inline vec& operator*=(T s)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] *= s;
+			}
+			return *this;
+		}
+
+		// In-place division of every component by the scalar s (no zero check).
+		inline vec& operator/=(T s)
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				m_s[c] /= s;
+			}
+			return *this;
+		}
+
+		// vec * scalar: each component multiplied by val (vector operand on the left).
+		friend inline vec operator*(const vec& lhs, T val)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = lhs.m_s[c] * val;
+			}
+			return out;
+		}
+
+		// scalar * vec: each component multiplied by val (scalar operand on the left).
+		friend inline vec operator*(T val, const vec& rhs)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = val * rhs.m_s[c];
+			}
+			return out;
+		}
+
+		// vec / vec: component-wise quotient (no zero check).
+		friend inline vec operator/(const vec& lhs, const vec& rhs)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = lhs.m_s[c] / rhs.m_s[c];
+			}
+			return out;
+		}
+
+		// vec / scalar: each component divided by val (no zero check).
+		friend inline vec operator/(const vec& lhs, T val)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = lhs.m_s[c] / val;
+			}
+			return out;
+		}
+
+		// vec + vec: component-wise sum.
+		friend inline vec operator+(const vec& lhs, const vec& rhs)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = lhs.m_s[c] + rhs.m_s[c];
+			}
+			return out;
+		}
+
+		// vec - vec: component-wise difference.
+		friend inline vec operator-(const vec& lhs, const vec& rhs)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = lhs.m_s[c] - rhs.m_s[c];
+			}
+			return out;
+		}
+
+		// 2D cross product of the X/Y components of a and b, returned as the 3D vector (0, 0, a.x*b.y - a.y*b.x).
+		static inline vec<3, T> cross2(const vec& a, const vec& b)
+		{
+			static_assert(N >= 2);
+			return vec<3, T>(0, 0, a[0] * b[1] - a[1] * b[0]);
+		}
+
+		// Member form: cross2(*this, b).
+		inline vec<3, T> cross2(const vec& b) const
+		{
+			return cross2(*this, b);
+		}
+
+		// 3D cross product of the first three components of a and b; any further components are ignored.
+		static inline vec<3, T> cross3(const vec& a, const vec& b)
+		{
+			static_assert(N >= 3);
+			return vec<3, T>(a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]);
+		}
+
+		// Member form: cross3(*this, b).
+		inline vec<3, T> cross3(const vec& b) const
+		{
+			return cross3(*this, b);
+		}
+
+		// Cross product dispatcher: uses cross2() for 2-component vectors and cross3() for everything larger.
+		// The result is always a 3-component vector.
+		static inline vec<3, T> cross(const vec& a, const vec& b)
+		{
+			static_assert(N >= 2);
+
+			// The branch must be resolved at compile time. With a plain `if`, cross3() is still instantiated
+			// for N == 2 and its static_assert(N >= 3) fires, so vec<2, T>::cross() could never compile.
+			if constexpr (N == 2)
+				return cross2(a, b);
+			else
+				return cross3(a, b);
+		}
+
+		// Member form of the static cross(): cross(*this, b).
+		inline vec<3, T> cross(const vec& b) const
+		{
+			static_assert(N >= 2);
+			return cross(*this, b);
+		}
+
+		// Dot product of this vector with rhs (delegates to the static two-argument form).
+		inline T dot(const vec& rhs) const
+		{
+			return dot(*this, rhs);
+		}
+
+		// Dot product of this vector with rhs, wrapped in a vec built from that single scalar.
+		inline vec dot_vector(const vec& rhs) const
+		{
+			const T d = dot(*this, rhs);
+			return vec(d);
+		}
+
+		// Sum over all N components of lhs[i] * rhs[i], accumulated in index order starting from component 0.
+		static inline T dot(const vec& lhs, const vec& rhs)
+		{
+			T acc = lhs.m_s[0] * rhs.m_s[0];
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				acc += lhs.m_s[c] * rhs.m_s[c];
+			}
+			return acc;
+		}
+
+		// Dot product restricted to the first two components.
+		inline T dot2(const vec& rhs) const
+		{
+			static_assert(N >= 2);
+			return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1];
+		}
+
+		// Dot product restricted to the first three components.
+		inline T dot3(const vec& rhs) const
+		{
+			static_assert(N >= 3);
+			return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2];
+		}
+
+		// Dot product restricted to the first four components.
+		inline T dot4(const vec& rhs) const
+		{
+			static_assert(N >= 4);
+			return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2] + m_s[3] * rhs.m_s[3];
+		}
+
+		// Squared length: the vector dotted with itself (sum of squared components).
+		inline T norm(void) const
+		{
+			return dot(*this, *this);
+		}
+
+		// Euclidean length: square root of norm().
+		inline T length(void) const
+		{
+			const T squared = norm();
+			return sqrt(squared);
+		}
+
+		// Squared Euclidean distance between this vector and rhs.
+		inline T squared_distance(const vec& rhs) const
+		{
+			T total = 0;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				const T delta = m_s[c] - rhs.m_s[c];
+				total += delta * delta;
+			}
+			return total;
+		}
+
+		// Squared Euclidean distance with an early exit: accumulation stops as soon as the running
+		// total exceeds early_out, so the returned value may be a partial sum in that case.
+		inline T squared_distance(const vec& rhs, T early_out) const
+		{
+			T total = 0;
+			uint32_t c = 0;
+			while (c < N)
+			{
+				const T delta = m_s[c] - rhs.m_s[c];
+				total += delta * delta;
+				if (total > early_out)
+					break;
+				++c;
+			}
+			return total;
+		}
+
+		// Euclidean distance between this vector and rhs: square root of squared_distance(rhs).
+		inline T distance(const vec& rhs) const
+		{
+			const T dist2 = squared_distance(rhs);
+			return sqrt(dist2);
+		}
+
+		// Returns the component-wise reciprocal as a new vector.
+		// Components that are zero map to 0 instead of being divided.
+		inline vec inverse() const
+		{
+			vec result;
+			for (uint32_t i = 0; i < N; i++)
+				result[i] = m_s[i] ? (1.0f / m_s[i]) : 0;
+			return result;
+		}
+
+		// Normalizes this vector in place and returns its squared length (norm) as measured BEFORE normalization.
+		// If the squared length is zero the vector is left untouched, unless pDefaultVec is non-null,
+		// in which case *pDefaultVec is copied into this vector.
+		inline double normalize(const vec* pDefaultVec = NULL)
+		{
+			// Each product is formed in T's arithmetic; the running sum is kept in double.
+			double n = m_s[0] * m_s[0];
+			for (uint32_t i = 1; i < N; i++)
+				n += m_s[i] * m_s[i];
+
+			if (n != 0)
+				*this *= static_cast<T>(1.0f / sqrt(n));
+			else if (pDefaultVec)
+				*this = *pDefaultVec;
+			return n;
+		}
+
+		// Like normalize(), but the length is measured over the first three components only.
+		// The resulting scale factor is still applied to every component of the vector.
+		inline double normalize3(const vec* pDefaultVec = NULL)
+		{
+			static_assert(N >= 3);
+
+			double n = m_s[0] * m_s[0] + m_s[1] * m_s[1] + m_s[2] * m_s[2];
+
+			if (n != 0)
+				*this *= static_cast<T>((1.0f / sqrt(n)));
+			else if (pDefaultVec)
+				*this = *pDefaultVec;
+			return n;
+		}
+
+		// normalize() variant that discards the squared length and returns *this for chaining.
+		inline vec& normalize_in_place(const vec* pDefaultVec = NULL)
+		{
+			normalize(pDefaultVec);
+			return *this;
+		}
+
+		// normalize3() variant that discards the squared length and returns *this for chaining.
+		inline vec& normalize3_in_place(const vec* pDefaultVec = NULL)
+		{
+			normalize3(pDefaultVec);
+			return *this;
+		}
+
+		// Returns a normalized copy (via normalize()); this vector is not modified.
+		inline vec get_normalized(const vec* pDefaultVec = NULL) const
+		{
+			vec result(*this);
+			result.normalize(pDefaultVec);
+			return result;
+		}
+
+		// Returns a copy normalized via normalize3(); this vector is not modified.
+		inline vec get_normalized3(const vec* pDefaultVec = NULL) const
+		{
+			vec result(*this);
+			result.normalize3(pDefaultVec);
+			return result;
+		}
+
+		// Clamps every component in place with basisu::clamp(value, l, h) and returns *this.
+		inline vec& clamp(T l, T h)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_s[i] = static_cast<T>(basisu::clamp(m_s[i], l, h));
+			return *this;
+		}
+
+		// Clamps every component in place to the [0, 1] range.
+		inline vec& saturate()
+		{
+			return clamp(0.0f, 1.0f);
+		}
+
+		// Clamps component i in place with basisu::clamp(value, l[i], h[i]) and returns *this.
+		inline vec& clamp(const vec& l, const vec& h)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_s[i] = static_cast<T>(basisu::clamp(m_s[i], l[i], h[i]));
+			return *this;
+		}
+
+		// True when no component i falls below l[i] or above h[i] (bounds are inclusive).
+		inline bool is_within_bounds(const vec& l, const vec& h) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				const bool below = (m_s[c] < l[c]);
+				if (below || (m_s[c] > h[c]))
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// True when no component falls below l or above h (bounds are inclusive).
+		inline bool is_within_bounds(T l, T h) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				const bool below = (m_s[c] < l);
+				if (below || (m_s[c] > h))
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// Index of the component with the largest absolute value; the lowest index wins ties.
+		inline uint32_t get_major_axis(void) const
+		{
+			uint32_t best_axis = 0;
+			T best_mag = fabs(m_s[0]);
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				const T mag = fabs(m_s[c]);
+				if (mag > best_mag)
+				{
+					best_axis = c;
+					best_mag = mag;
+				}
+			}
+			return best_axis;
+		}
+
+		// Index of the component with the smallest absolute value; the lowest index wins ties.
+		inline uint32_t get_minor_axis(void) const
+		{
+			uint32_t best_axis = 0;
+			T best_mag = fabs(m_s[0]);
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				const T mag = fabs(m_s[c]);
+				if (mag < best_mag)
+				{
+					best_axis = c;
+					best_mag = mag;
+				}
+			}
+			return best_axis;
+		}
+
+		// Picks the two axes (u, v) that follow the major axis, in an order that depends on the sign
+		// of the major-axis component: for a negative component the pair is assigned v first, then u;
+		// otherwise u first, then v.
+		// NOTE(review): presumably basisu::next_wrap(x, N) returns the next index with wrap-around at N - confirm.
+		inline void get_projection_axes(uint32_t& u, uint32_t& v) const
+		{
+			const int axis = get_major_axis();
+			if (m_s[axis] < 0.0f)
+			{
+				v = basisu::next_wrap<uint32_t>(axis, N);
+				u = basisu::next_wrap<uint32_t>(v, N);
+			}
+			else
+			{
+				u = basisu::next_wrap<uint32_t>(axis, N);
+				v = basisu::next_wrap<uint32_t>(u, N);
+			}
+		}
+
+		// Smallest absolute value found among the components.
+		inline T get_absolute_minimum(void) const
+		{
+			T lowest = fabs(m_s[0]);
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				lowest = basisu::minimum(lowest, fabs(m_s[c]));
+			}
+			return lowest;
+		}
+
+		// Largest absolute value found among the components.
+		inline T get_absolute_maximum(void) const
+		{
+			T highest = fabs(m_s[0]);
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				highest = basisu::maximum(highest, fabs(m_s[c]));
+			}
+			return highest;
+		}
+
+		// Smallest (signed) component value.
+		inline T get_minimum(void) const
+		{
+			T lowest = m_s[0];
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				lowest = basisu::minimum(lowest, m_s[c]);
+			}
+			return lowest;
+		}
+
+		// Largest (signed) component value.
+		inline T get_maximum(void) const
+		{
+			T highest = m_s[0];
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				highest = basisu::maximum(highest, m_s[c]);
+			}
+			return highest;
+		}
+
+		// Subtracts dot(dir) * dir from this vector in place.
+		// As the name says, dir is expected to be unit length; the code does not check this.
+		inline vec& remove_unit_direction(const vec& dir)
+		{
+			*this -= (dot(dir) * dir);
+			return *this;
+		}
+
+		// Non-mutating form: returns *this - dot(dir) * dir.
+		inline vec get_remove_unit_direction(const vec& dir) const
+		{
+			return *this - (dot(dir) * dir);
+		}
+
+		// True unless some component i satisfies m_s[i] >= b[i].
+		inline bool all_less(const vec& b) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				if (m_s[c] >= b.m_s[c])
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// True unless some component i satisfies m_s[i] > b[i].
+		inline bool all_less_equal(const vec& b) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				if (m_s[c] > b.m_s[c])
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// True unless some component i satisfies m_s[i] <= b[i].
+		inline bool all_greater(const vec& b) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				if (m_s[c] <= b.m_s[c])
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// True unless some component i satisfies m_s[i] < b[i].
+		inline bool all_greater_equal(const vec& b) const
+		{
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				if (m_s[c] < b.m_s[c])
+				{
+					return false;
+				}
+			}
+			return true;
+		}
+
+		// Returns a copy with the first three components (as many as exist) negated.
+		// Components from index 3 onward are copied unchanged.
+		inline vec negate_xyz() const
+		{
+			vec ret;
+
+			ret[0] = -m_s[0];
+			if (N >= 2)
+				ret[1] = -m_s[1];
+			if (N >= 3)
+				ret[2] = -m_s[2];
+
+			for (uint32_t i = 3; i < N; i++)
+				ret[i] = m_s[i];
+
+			return ret;
+		}
+
+		// Replaces every non-zero component with its reciprocal, in place. Zero components stay zero.
+		inline vec& invert()
+		{
+			for (uint32_t i = 0; i < N; i++)
+				if (m_s[i] != 0.0f)
+					m_s[i] = 1.0f / m_s[i];
+			return *this;
+		}
+
+		// 2D-only: returns x * b.y - y * b.x.
+		inline scalar_type perp_dot(const vec& b) const
+		{
+			static_assert(N == 2);
+			return m_s[0] * b.m_s[1] - m_s[1] * b.m_s[0];
+		}
+
+		// 2D-only: returns the perpendicular vector (-y, x).
+		inline vec perp() const
+		{
+			static_assert(N == 2);
+			return vec(-m_s[1], m_s[0]);
+		}
+
+		// Returns a new vector holding floor() of each component.
+		inline vec get_floor() const
+		{
+			vec floored;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				floored[c] = floor(m_s[c]);
+			}
+			return floored;
+		}
+
+		// Returns a new vector holding ceil() of each component.
+		inline vec get_ceil() const
+		{
+			vec ceiled;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				ceiled[c] = ceil(m_s[c]);
+			}
+			return ceiled;
+		}
+
+		// Sum of all components, accumulated in index order.
+		inline T get_total() const
+		{
+			T sum = m_s[0];
+			for (uint32_t c = 1; c < N; ++c)
+			{
+				sum += m_s[c];
+			}
+			return sum;
+		}
+
+		// static helper methods
+
+		// Component-wise product: out[i] = lhs[i] * rhs[i].
+		static inline vec mul_components(const vec& lhs, const vec& rhs)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out[c] = lhs.m_s[c] * rhs.m_s[c];
+			}
+			return out;
+		}
+
+		// Component-wise multiply-add: out[i] = a[i] * b[i] + c[i].
+		static inline vec mul_add_components(const vec& a, const vec& b, const vec& c)
+		{
+			vec out;
+			for (uint32_t idx = 0; idx < N; ++idx)
+			{
+				out[idx] = a.m_s[idx] * b.m_s[idx] + c.m_s[idx];
+			}
+			return out;
+		}
+
+		// Builds the axis vector for index i: clear() the vector, then set component i to 1.
+		static inline vec make_axis(uint32_t i)
+		{
+			vec result;
+			result.clear();
+			result[i] = 1;
+			return result;
+		}
+
+		// Comparison masks. Each returns a vec whose component i holds the bool result of comparing
+		// a[i] with b[i], converted to T (so 1 where the comparison holds, 0 where it does not).
+
+		// Mask of a[i] == b[i].
+		static inline vec equals_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] == b[i]);
+			return ret;
+		}
+
+		// Mask of a[i] != b[i].
+		static inline vec not_equals_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] != b[i]);
+			return ret;
+		}
+
+		// Mask of a[i] < b[i].
+		static inline vec less_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] < b[i]);
+			return ret;
+		}
+
+		// Mask of a[i] <= b[i].
+		static inline vec less_equals_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] <= b[i]);
+			return ret;
+		}
+
+		// Mask of a[i] >= b[i].
+		static inline vec greater_equals_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] >= b[i]);
+			return ret;
+		}
+
+		// Mask of a[i] > b[i].
+		static inline vec greater_mask(const vec& a, const vec& b)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret[i] = (a[i] > b[i]);
+			return ret;
+		}
+
+		// Component-wise maximum of a and b, computed with basisu::maximum.
+		static inline vec component_max(const vec& a, const vec& b)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = basisu::maximum(a.m_s[c], b.m_s[c]);
+			}
+			return out;
+		}
+
+		// Component-wise minimum of a and b, computed with basisu::minimum.
+		static inline vec component_min(const vec& a, const vec& b)
+		{
+			vec out;
+			for (uint32_t c = 0; c < N; ++c)
+			{
+				out.m_s[c] = basisu::minimum(a.m_s[c], b.m_s[c]);
+			}
+			return out;
+		}
+
+		// Linear interpolation per component: a + (b - a) * t. t is not clamped, so values outside [0, 1] extrapolate.
+		static inline vec lerp(const vec& a, const vec& b, float t)
+		{
+			vec ret;
+			for (uint32_t i = 0; i < N; i++)
+				ret.m_s[i] = a.m_s[i] + (b.m_s[i] - a.m_s[i]) * t;
+			return ret;
+		}
+
+		// True when basisu::equal_tol(a[i], b[i], t) holds for every component; stops at the first failure.
+		static inline bool equal_tol(const vec& a, const vec& b, float t)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				if (!basisu::equal_tol(a.m_s[i], b.m_s[i], t))
+					return false;
+			return true;
+		}
+
+		// Member form: equal_tol(*this, b, t).
+		inline bool equal_tol(const vec& b, float t) const
+		{
+			return equal_tol(*this, b, t);
+		}
+
+		// Fills every component with r.frand(l, h); the generator is called once per component, in index order.
+		static inline vec make_random(basisu::rand& r, float l, float h)
+		{
+			vec result;
+			for (uint32_t i = 0; i < N; i++)
+				result[i] = r.frand(l, h);
+			return result;
+		}
+
+		// Fills component i with r.frand(l[i], h[i]), i.e. with per-component bounds.
+		static inline vec make_random(basisu::rand& r, const vec& l, const vec& h)
+		{
+			vec result;
+			for (uint32_t i = 0; i < N; i++)
+				result[i] = r.frand(l[i], h[i]);
+			return result;
+		}
+
+		// Debug helper: prints all components on one line to stdout using "%3.3f", followed by a newline.
+		void print() const
+		{
+			for (uint32_t c = 0; c < N; c++)
+			{
+				// "%f" consumes a double. Floating-point T is promoted automatically, but an integer T
+				// (e.g. vec<2, int>) would be passed as an int, which is undefined behaviour - so convert explicitly.
+				printf("%3.3f ", static_cast<double>((*this)[c]));
+			}
+			printf("\n");
+		}
+
+	protected:
+		T m_s[N];
+	};
+
+	// Convenience aliases. Naming: vec<dimension><scalar>, where D = double, F = float, U = uint32_t, I = int, I16 = int16_t.
+	typedef vec<1, double> vec1D;
+	typedef vec<2, double> vec2D;
+	typedef vec<3, double> vec3D;
+	typedef vec<4, double> vec4D;
+
+	typedef vec<1, float> vec1F;
+
+	// The *_array aliases are basisu::vector containers of the corresponding float vector type.
+	typedef vec<2, float> vec2F;
+	typedef basisu::vector<vec2F> vec2F_array;
+
+	typedef vec<3, float> vec3F;
+	typedef basisu::vector<vec3F> vec3F_array;
+
+	typedef vec<4, float> vec4F;
+	typedef basisu::vector<vec4F> vec4F_array;
+
+	typedef vec<2, uint32_t> vec2U;
+	typedef vec<3, uint32_t> vec3U;
+	typedef vec<2, int> vec2I;
+	typedef vec<3, int> vec3I;
+	typedef vec<4, int> vec4I;
+
+	typedef vec<2, int16_t> vec2I16;
+	typedef vec<3, int16_t> vec3I16;
+
+	// Rotates the 2D point p about the origin by rad radians:
+	// (x, y) -> (x*cos - y*sin, x*sin + y*cos).
+	inline vec2F rotate_point_2D(const vec2F& p, float rad)
+	{
+		const float cos_a = cosf(rad);
+		const float sin_a = sinf(rad);
+
+		const float px = p[0];
+		const float py = p[1];
+
+		const float rx = px * cos_a - py * sin_a;
+		const float ry = px * sin_a + py * cos_a;
+		return vec2F(rx, ry);
+	}
+
+	//--------------------------------------------------------------
+
+	// Matrix/vector cheat sheet, because confusingly, depending on how matrices are stored in memory people can use opposite definitions of "rows", "cols", etc.
+	// See http://www.mindcontrol.org/~hplus/graphics/matrix-layout.html
+	//
+	// So in this simple row-major general matrix class:
+	// matrix=[NumRows][NumCols] or [R][C], i.e. a 3x3 matrix stored in memory will appear as: R0C0, R0C1, R0C2,  R1C0, R1C1, R1C2,  etc.
+	// Matrix multiplication: [R0,C0]*[R1,C1]=[R0,C1], C0 must equal R1
+	//
+	// In this class:
+	// A "row vector" type is a vector of size # of matrix cols, 1xC. It's the vector type that is used to store the matrix rows.
+	// A "col vector" type is a vector of size # of matrix rows, Rx1. It's a vector type large enough to hold each matrix column.
+	//
+	// Subrow/col vectors: last component is assumed to be either 0 (a "vector") or 1 (a "point")
+	// "subrow vector": vector/point of size # cols-1, 1x(C-1)
+	// "subcol vector": vector/point of size # rows-1, (R-1)x1
+	//
+	// D3D style:
+	// vec*matrix, row vector on left (vec dotted against columns)
+	// [1,4]*[4,4]=[1,4]
+	// abcd * A B C D
+	//        A B C D
+	//        A B C D
+	//        A B C D
+	// =      e f g h
+	//
+	// Now confusingly, in the matrix transform method for vec*matrix below the vector's type is "col_vec", because col_vec will have the proper size for non-square matrices. But the vector on the left is written as row vector, argh.
+	//
+	//
+	// OGL style:
+	// matrix*vec, col vector on right (vec dotted against rows):
+	// [4,4]*[4,1]=[4,1]
+	//
+	// A B C D * e = e
+	// A B C D   f   f
+	// A B C D   g   g
+	// A B C D   h   h
+
+	// Computes result = lhs * rhs for matrix-like types exposing num_rows, num_cols and operator()(row, col).
+	// Dimensions are checked at compile time: Z is X::num_rows x Y::num_cols and X::num_cols == Y::num_rows.
+	// result must not alias lhs or rhs (asserted), because it is written while the inputs are still being read.
+	// Returns a reference to result.
+	template <class X, class Y, class Z>
+	Z& matrix_mul_helper(Z& result, const X& lhs, const Y& rhs)
+	{
+		static_assert((int)Z::num_rows == (int)X::num_rows);
+		static_assert((int)Z::num_cols == (int)Y::num_cols);
+		static_assert((int)X::num_cols == (int)Y::num_rows);
+		assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs));
+		for (int r = 0; r < X::num_rows; r++)
+			for (int c = 0; c < Y::num_cols; c++)
+			{
+				// Dot product of row r of lhs with column c of rhs, seeded with the i == 0 term.
+				typename Z::scalar_type s = lhs(r, 0) * rhs(0, c);
+				for (uint32_t i = 1; i < X::num_cols; i++)
+					s += lhs(r, i) * rhs(i, c);
+				result(r, c) = s;
+			}
+		return result;
+	}
+
+	// Computes result = transpose(lhs) * rhs without materializing the transpose.
+	// Dimensions are checked at compile time: Z is X::num_cols x Y::num_cols and X::num_rows == Y::num_rows.
+	// result must not alias lhs or rhs (asserted). Returns a reference to result.
+	template <class X, class Y, class Z>
+	Z& matrix_mul_helper_transpose_lhs(Z& result, const X& lhs, const Y& rhs)
+	{
+		static_assert((int)Z::num_rows == (int)X::num_cols);
+		static_assert((int)Z::num_cols == (int)Y::num_cols);
+		static_assert((int)X::num_rows == (int)Y::num_rows);
+		assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs));
+		for (int r = 0; r < X::num_cols; r++)
+			for (int c = 0; c < Y::num_cols; c++)
+			{
+				// Dot product of column r of lhs with column c of rhs, seeded with the i == 0 term.
+				typename Z::scalar_type s = lhs(0, r) * rhs(0, c);
+				for (uint32_t i = 1; i < X::num_rows; i++)
+					s += lhs(i, r) * rhs(i, c);
+				result(r, c) = s;
+			}
+		return result;
+	}
+
+	// Computes result = lhs * transpose(rhs) without materializing the transpose.
+	// Dimensions are checked at compile time: Z is X::num_rows x Y::num_rows and X::num_cols == Y::num_cols.
+	// result must not alias lhs or rhs (asserted). Returns a reference to result.
+	template <class X, class Y, class Z>
+	Z& matrix_mul_helper_transpose_rhs(Z& result, const X& lhs, const Y& rhs)
+	{
+		static_assert((int)Z::num_rows == (int)X::num_rows);
+		static_assert((int)Z::num_cols == (int)Y::num_rows);
+		static_assert((int)X::num_cols == (int)Y::num_cols);
+		assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs));
+		for (int r = 0; r < X::num_rows; r++)
+			for (int c = 0; c < Y::num_rows; c++)
+			{
+				// Dot product of row r of lhs with row c of rhs, seeded with the i == 0 term.
+				typename Z::scalar_type s = lhs(r, 0) * rhs(c, 0);
+				for (uint32_t i = 1; i < X::num_cols; i++)
+					s += lhs(r, i) * rhs(c, i);
+				result(r, c) = s;
+			}
+		return result;
+	}
+		
+	template <uint32_t R, uint32_t C, typename T>
+	class matrix
+	{
+	public:
+		// Element type, also read by the matrix_mul_helper* functions.
+		typedef T scalar_type;
+		// Compile-time dimensions.
+		enum
+		{
+			num_rows = R,
+			num_cols = C
+		};
+
+		// col_vec has one entry per matrix row; subcol_vec is one entry shorter (0 entries when R <= 1).
+		typedef vec<R, T> col_vec;
+		typedef vec < (R > 1) ? (R - 1) : 0, T > subcol_vec;
+
+		// row_vec has one entry per matrix column and is the type used to store each row;
+		// subrow_vec is one entry shorter (0 entries when C <= 1).
+		typedef vec<C, T> row_vec;
+		typedef vec < (C > 1) ? (C - 1) : 0, T > subrow_vec;
+
+		// Default constructor: performs no explicit initialization of the rows.
+		inline matrix()
+		{
+		}
+
+		// Tag constructor: all rows cleared via clear().
+		inline matrix(basisu::eClear)
+		{
+			clear();
+		}
+
+		// Tag constructor: initialized via set_identity_matrix().
+		inline matrix(basisu::eIdentity)
+		{
+			set_identity_matrix();
+		}
+
+		// Constructs from a raw element pointer by forwarding to set(p).
+		inline matrix(const T* p)
+		{
+			set(p);
+		}
+
+		// Copy constructor: copies the rows one by one.
+		inline matrix(const matrix& other)
+		{
+			for (uint32_t i = 0; i < R; i++)
+				m_rows[i] = other.m_rows[i];
+		}
+
+		// Copy assignment: copies the rows one by one; self-assignment is a no-op.
+		inline matrix& operator=(const matrix& rhs)
+		{
+			if (this != &rhs)
+				for (uint32_t i = 0; i < R; i++)
+					m_rows[i] = rhs.m_rows[i];
+			return *this;
+		}
+
+		// Value-list constructors. Arguments are named val<row><col> and listed row by row;
+		// each constructor forwards to the set() overload with the same argument list.
+
+		// 2 rows x 2 values.
+		inline matrix(T val00, T val01,
+			T val10, T val11)
+		{
+			set(val00, val01, val10, val11);
+		}
+
+		// 3 rows x 2 values.
+		inline matrix(T val00, T val01,
+			T val10, T val11,
+			T val20, T val21)
+		{
+			set(val00, val01, val10, val11, val20, val21);
+		}
+
+		// 3 rows x 3 values.
+		inline matrix(T val00, T val01, T val02,
+			T val10, T val11, T val12,
+			T val20, T val21, T val22)
+		{
+			set(val00, val01, val02, val10, val11, val12, val20, val21, val22);
+		}
+
+		// 4 rows x 4 values.
+		inline matrix(T val00, T val01, T val02, T val03,
+			T val10, T val11, T val12, T val13,
+			T val20, T val21, T val22, T val23,
+			T val30, T val31, T val32, T val33)
+		{
+			set(val00, val01, val02, val03, val10, val11, val12, val13, val20, val21, val22, val23, val30, val31, val32, val33);
+		}
+
+		// 3 rows x 4 values.
+		inline matrix(T val00, T val01, T val02, T val03,
+			T val10, T val11, T val12, T val13,
+			T val20, T val21, T val22, T val23)
+		{
+			set(val00, val01, val02, val03, val10, val11, val12, val13, val20, val21, val22, val23);
+		}
+
+		// Loads all R rows from a flat array: row i is set from p + i * C (the pointer advances by C per row).
+		// NOTE(review): the parameter is const float* rather than const T*, while matrix(const T*) forwards here;
+		// for T other than float that call presumably cannot compile - confirm whether const T* was intended.
+		inline void set(const float* p)
+		{
+			for (uint32_t i = 0; i < R; i++)
+			{
+				m_rows[i].set(p);
+				p += C;
+			}
+		}
+
+		// Value-list setters. Arguments are named val<row><col> and listed row by row.
+		// Shared behaviour: row 0 is always written; each further row is written only if the matrix
+		// has that many rows (extra arguments are ignored); any matrix rows beyond those supplied are cleared.
+
+		// 2 rows x 2 values.
+		inline void set(T val00, T val01,
+			T val10, T val11)
+		{
+			m_rows[0].set(val00, val01);
+			if (R >= 2)
+			{
+				m_rows[1].set(val10, val11);
+
+				for (uint32_t i = 2; i < R; i++)
+					m_rows[i].clear();
+			}
+		}
+
+		// 3 rows x 2 values.
+		inline void set(T val00, T val01,
+			T val10, T val11,
+			T val20, T val21)
+		{
+			m_rows[0].set(val00, val01);
+			if (R >= 2)
+			{
+				m_rows[1].set(val10, val11);
+
+				if (R >= 3)
+				{
+					m_rows[2].set(val20, val21);
+
+					for (uint32_t i = 3; i < R; i++)
+						m_rows[i].clear();
+				}
+			}
+		}
+
+		// 3 rows x 3 values.
+		inline void set(T val00, T val01, T val02,
+			T val10, T val11, T val12,
+			T val20, T val21, T val22)
+		{
+			m_rows[0].set(val00, val01, val02);
+			if (R >= 2)
+			{
+				m_rows[1].set(val10, val11, val12);
+				if (R >= 3)
+				{
+					m_rows[2].set(val20, val21, val22);
+
+					for (uint32_t i = 3; i < R; i++)
+						m_rows[i].clear();
+				}
+			}
+		}
+
+		// 4 rows x 4 values.
+		inline void set(T val00, T val01, T val02, T val03,
+			T val10, T val11, T val12, T val13,
+			T val20, T val21, T val22, T val23,
+			T val30, T val31, T val32, T val33)
+		{
+			m_rows[0].set(val00, val01, val02, val03);
+			if (R >= 2)
+			{
+				m_rows[1].set(val10, val11, val12, val13);
+				if (R >= 3)
+				{
+					m_rows[2].set(val20, val21, val22, val23);
+
+					if (R >= 4)
+					{
+						m_rows[3].set(val30, val31, val32, val33);
+
+						for (uint32_t i = 4; i < R; i++)
+							m_rows[i].clear();
+					}
+				}
+			}
+		}
+
+		// 3 rows x 4 values.
+		inline void set(T val00, T val01, T val02, T val03,
+			T val10, T val11, T val12, T val13,
+			T val20, T val21, T val22, T val23)
+		{
+			m_rows[0].set(val00, val01, val02, val03);
+			if (R >= 2)
+			{
+				m_rows[1].set(val10, val11, val12, val13);
+				if (R >= 3)
+				{
+					m_rows[2].set(val20, val21, val22, val23);
+
+					for (uint32_t i = 3; i < R; i++)
+						m_rows[i].clear();
+				}
+			}
+		}
+
+		// Number of rows (R).
+		inline uint32_t get_num_rows() const
+		{
+			return num_rows;
+		}
+
+		// Number of columns (C).
+		inline uint32_t get_num_cols() const
+		{
+			return num_cols;
+		}
+
+		// Total element count (R * C).
+		inline uint32_t get_total_elements() const
+		{
+			return num_rows * num_cols;
+		}
+
+		// Element read access by (row, column); both indices are range-checked with assert.
+		inline T operator()(uint32_t r, uint32_t c) const
+		{
+			assert((r < R) && (c < C));
+			return m_rows[r][c];
+		}
+
+		// Element write access by (row, column); both indices are range-checked with assert.
+		inline T& operator()(uint32_t r, uint32_t c)
+		{
+			assert((r < R) && (c < C));
+			return m_rows[r][c];
+		}
+
+		// Read access to row r as a row_vec; the index is range-checked with assert.
+		inline const row_vec& operator[](uint32_t r) const
+		{
+			assert(r < R);
+			return m_rows[r];
+		}
+
+		// Write access to row r as a row_vec; the index is range-checked with assert.
+		inline row_vec& operator[](uint32_t r)
+		{
+			assert(r < R);
+			return m_rows[r];
+		}
+
+		// Named equivalents of operator[] for row access.
+		inline const row_vec& get_row(uint32_t r) const
+		{
+			return (*this)[r];
+		}
+
+		inline row_vec& get_row(uint32_t r)
+		{
+			return (*this)[r];
+		}
+
+		// Overwrites row r with v.
+		inline void set_row(uint32_t r, const row_vec& v)
+		{
+			(*this)[r] = v;
+		}
+
+		// Gathers column c into a col_vec (one entry per row). Returned by value, since columns are not stored contiguously.
+		inline col_vec get_col(uint32_t c) const
+		{
+			assert(c < C);
+			col_vec result;
+			for (uint32_t i = 0; i < R; i++)
+				result[i] = m_rows[i][c];
+			return result;
+		}
+
+		// Overwrites column c with the R entries of col.
+		inline void set_col(uint32_t c, const col_vec& col)
+		{
+			assert(c < C);
+			for (uint32_t i = 0; i < R; i++)
+				m_rows[i][c] = col[i];
+		}
+
+		// Overwrites the first R - 1 entries of column c with col and sets the last entry to 0.
+		inline void set_col(uint32_t c, const subcol_vec& col)
+		{
+			assert(c < C);
+			for (uint32_t i = 0; i < (R - 1); i++)
+				m_rows[i][c] = col[i];
+
+			m_rows[R - 1][c] = 0.0f;
+		}
+
+		// The translation is stored in the last row (index R - 1); this returns a reference to that row.
+		inline const row_vec& get_translate() const
+		{
+			return m_rows[R - 1];
+		}
+
+		// Overwrites the last row with r exactly as given.
+		inline matrix& set_translate(const row_vec& r)
+		{
+			m_rows[R - 1] = r;
+			return *this;
+		}
+
+		// Overwrites the last row with r converted to a row_vec whose final component is forced to 1 (as_point()).
+		inline matrix& set_translate(const subrow_vec& r)
+		{
+			m_rows[R - 1] = row_vec(r).as_point();
+			return *this;
+		}
+
+		// Reinterprets the address of the first row as a pointer to T.
+		// NOTE(review): treating the result as R * C contiguous row-major elements assumes row_vec has no
+		// padding or extra members beyond its T[C] storage - confirm.
+		inline const T* get_ptr() const
+		{
+			return reinterpret_cast<const T*>(&m_rows[0]);
+		}
+		inline T* get_ptr()
+		{
+			return reinterpret_cast<T*>(&m_rows[0]);
+		}
+
+		// In-place element-wise addition, applied row by row.
+		inline matrix& operator+=(const matrix& other)
+		{
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				m_rows[row] += other.m_rows[row];
+			}
+			return *this;
+		}
+
+		// In-place element-wise subtraction, applied row by row.
+		inline matrix& operator-=(const matrix& other)
+		{
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				m_rows[row] -= other.m_rows[row];
+			}
+			return *this;
+		}
+
+		// In-place multiplication of every element by the scalar val.
+		inline matrix& operator*=(T val)
+		{
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				m_rows[row] *= val;
+			}
+			return *this;
+		}
+
+		// In-place division of every element by the scalar val.
+		inline matrix& operator/=(T val)
+		{
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				m_rows[row] /= val;
+			}
+			return *this;
+		}
+
+		// In-place matrix product: *this = *this * other.
+		// The product is built in a temporary because matrix_mul_helper() forbids aliasing of its output.
+		inline matrix& operator*=(const matrix& other)
+		{
+			matrix product;
+			matrix_mul_helper(product, *this, other);
+			*this = product;
+			return *this;
+		}
+
+		// matrix + matrix: element-wise sum.
+		friend inline matrix operator+(const matrix& lhs, const matrix& rhs)
+		{
+			matrix out;
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				out[row] = lhs.m_rows[row] + rhs.m_rows[row];
+			}
+			return out;
+		}
+
+		// matrix - matrix: element-wise difference.
+		friend inline matrix operator-(const matrix& lhs, const matrix& rhs)
+		{
+			matrix out;
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				out[row] = lhs.m_rows[row] - rhs.m_rows[row];
+			}
+			return out;
+		}
+
+		// matrix * scalar: every element multiplied by val (matrix operand on the left).
+		friend inline matrix operator*(const matrix& lhs, T val)
+		{
+			matrix out;
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				out[row] = lhs.m_rows[row] * val;
+			}
+			return out;
+		}
+
+		// matrix / scalar: every element divided by val.
+		friend inline matrix operator/(const matrix& lhs, T val)
+		{
+			matrix out;
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				out[row] = lhs.m_rows[row] / val;
+			}
+			return out;
+		}
+
+		// scalar * matrix: every element multiplied by val (scalar operand on the left).
+		friend inline matrix operator*(T val, const matrix& rhs)
+		{
+			matrix out;
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				out[row] = val * rhs.m_rows[row];
+			}
+			return out;
+		}
+
+#if 0
+		template<uint32_t R0, uint32_t C0, uint32_t R1, uint32_t C1, typename T>
+		friend inline matrix operator*(const matrix<R0, C0, T>& lhs, const matrix<R1, C1, T>& rhs)
+		{
+			matrix<R0, C1, T> result;
+			return matrix_mul_helper(result, lhs, rhs);
+		}
+#endif
+		// Matrix product of two matrices of this same type, computed by matrix_mul_helper().
+		// The helper's static_asserts require C == R, so this is only instantiable for square matrices.
+		friend inline matrix operator*(const matrix& lhs, const matrix& rhs)
+		{
+			matrix result;
+			return matrix_mul_helper(result, lhs, rhs);
+		}
+
+		// vec * matrix (row vector on the left): shorthand for transform(a, b).
+		friend inline row_vec operator*(const col_vec& a, const matrix& b)
+		{
+			return transform(a, b);
+		}
+
+		// Unary plus: returns an unmodified copy.
+		inline matrix operator+() const
+		{
+			return *this;
+		}
+
+		// Unary minus: returns a copy with every row negated.
+		inline matrix operator-() const
+		{
+			matrix result;
+			for (uint32_t i = 0; i < R; i++)
+				result[i] = -m_rows[i];
+			return result;
+		}
+
+		// Calls clear() on every row and returns *this.
+		inline matrix& clear()
+		{
+			for (uint32_t i = 0; i < R; i++)
+				m_rows[i].clear();
+			return *this;
+		}
+
+		// Alias of clear().
+		inline matrix& set_zero_matrix()
+		{
+			clear();
+			return *this;
+		}
+
+		// Clears every row and writes 1 on the main diagonal, then returns *this.
+		// For non-square matrices only the diagonal entries that actually exist are written.
+		inline matrix& set_identity_matrix()
+		{
+			for (uint32_t i = 0; i < R; i++)
+			{
+				m_rows[i].clear();
+				// Rows with index >= C have no diagonal element; indexing m_rows[i][i] there
+				// would run past the end of the row when R > C.
+				if (i < C)
+					m_rows[i][i] = 1.0f;
+			}
+			return *this;
+		}
+
+		// Builds a uniform scale matrix: clears everything, writes s on the first R - 1 diagonal
+		// entries and 1 into the last element (row R - 1, column C - 1).
+		// NOTE(review): the diagonal writes index m_rows[i][i] for i < R - 1, which stays in range only when
+		// R - 1 <= C; the comparison also mixes a signed index with an unsigned bound - confirm intended use is square/affine matrices.
+		inline matrix& set_scale_matrix(float s)
+		{
+			clear();
+			for (int i = 0; i < (R - 1); i++)
+				m_rows[i][i] = s;
+			m_rows[R - 1][C - 1] = 1.0f;
+			return *this;
+		}
+
+		// Builds a scale matrix whose main diagonal is taken from s; every other element is cleared.
+		// For non-square matrices only the min(R, C) diagonal entries that exist are written.
+		inline matrix& set_scale_matrix(const row_vec& s)
+		{
+			clear();
+			// s has C entries and each row has C columns, so indices >= C are invalid for both the
+			// read of s[i] and the write of m_rows[i][i]; bound the loop by the shorter dimension.
+			const uint32_t diag_len = (R < C) ? R : C;
+			for (uint32_t i = 0; i < diag_len; i++)
+				m_rows[i][i] = s[i];
+			return *this;
+		}
+
+		// Identity matrix with the first two diagonal entries replaced by x and y.
+		inline matrix& set_scale_matrix(float x, float y)
+		{
+			set_identity_matrix();
+			m_rows[0].set_x(x);
+			m_rows[1].set_y(y);
+			return *this;
+		}
+
+		// Identity matrix with the first three diagonal entries replaced by x, y and z.
+		inline matrix& set_scale_matrix(float x, float y, float z)
+		{
+			set_identity_matrix();
+			m_rows[0].set_x(x);
+			m_rows[1].set_y(y);
+			m_rows[2].set_z(z);
+			return *this;
+		}
+
+		// Identity matrix whose last row is replaced by s exactly as given.
+		inline matrix& set_translate_matrix(const row_vec& s)
+		{
+			set_identity_matrix();
+			set_translate(s);
+			return *this;
+		}
+
+		// Identity matrix whose last row is row_vec(x, y) with its final component forced to 1 (as_point()).
+		inline matrix& set_translate_matrix(float x, float y)
+		{
+			set_identity_matrix();
+			set_translate(row_vec(x, y).as_point());
+			return *this;
+		}
+
+		// Identity matrix whose last row is row_vec(x, y, z) with its final component forced to 1 (as_point()).
+		inline matrix& set_translate_matrix(float x, float y, float z)
+		{
+			set_identity_matrix();
+			set_translate(row_vec(x, y, z).as_point());
+			return *this;
+		}
+
+		// Returns the transpose as a new matrix. Square matrices only (enforced at compile time);
+		// use get_transposed_nonsquare() otherwise.
+		inline matrix get_transposed() const
+		{
+			static_assert(R == C);
+
+			matrix result;
+			for (uint32_t i = 0; i < R; i++)
+				for (uint32_t j = 0; j < C; j++)
+					result.m_rows[i][j] = m_rows[j][i];
+			return result;
+		}
+
+		// Returns the transpose as a C x R matrix; works for any shape.
+		inline matrix<C, R, T> get_transposed_nonsquare() const
+		{
+			matrix<C, R, T> result;
+			for (uint32_t i = 0; i < R; i++)
+				for (uint32_t j = 0; j < C; j++)
+					result[j][i] = m_rows[i][j];
+			return result;
+		}
+
+		// Replaces this matrix with its transpose and returns *this. Square matrices only.
+		inline matrix& transpose_in_place()
+		{
+			// Same restriction as get_transposed(): for R != C the m_rows[j][i] access below
+			// would index outside the matrix, so reject that at compile time.
+			static_assert(R == C);
+
+			// Built in a temporary because every element is both a source and a destination.
+			matrix result;
+			for (uint32_t i = 0; i < R; i++)
+				for (uint32_t j = 0; j < C; j++)
+					result.m_rows[i][j] = m_rows[j][i];
+			*this = result;
+			return *this;
+		}
+
+		// Frobenius norm: square root of the sum of the squares of all elements.
+		T get_norm() const
+		{
+			T sum_sq = 0;
+
+			for (uint32_t row = 0; row < R; ++row)
+			{
+				for (uint32_t col = 0; col < C; ++col)
+				{
+					sum_sq += m_rows[row][col] * m_rows[row][col];
+				}
+			}
+
+			return static_cast<T>(sqrt(sum_sq));
+		}
+
+		// Element-wise power: returns a matrix whose elements are pow(element, p).
+		// This is NOT a matrix power (no matrix multiplication is involved).
+		inline matrix get_power(T p) const
+		{
+			matrix result;
+
+			for (uint32_t i = 0; i < R; i++)
+				for (uint32_t j = 0; j < C; j++)
+					result[i][j] = static_cast<T>(pow(m_rows[i][j], p));
+
+			return result;
+		}
+
+		// Dots every row of this matrix against the single row of b.
+		// Returns a 1 x R matrix whose entry r is sum over c of (*this)[r][c] * b[0][c].
+		inline matrix<1, R, T> numpy_dot(const matrix<1, C, T>& b) const
+		{
+			matrix<1, R, T> result;
+
+			for (uint32_t r = 0; r < R; r++)
+			{
+				T sum = 0;
+				for (uint32_t c = 0; c < C; c++)
+					sum += m_rows[r][c] * b[0][c];
+
+				result[0][r] = static_cast<T>(sum);
+			}
+
+			return result;
+		}
+
+		// Computes the inverse of this (square) matrix into result using Gauss-Jordan elimination with
+		// row pivoting on the largest absolute column entry. This matrix itself is not modified.
+		// Returns true on success. Returns false if a zero pivot is found (singular matrix), in which
+		// case result is reset to the identity matrix.
+		bool invert(matrix& result) const
+		{
+			static_assert(R == C);
+
+			// result starts as identity and receives every row operation applied to the working copy.
+			result.set_identity_matrix();
+
+			matrix mat(*this);
+
+			for (uint32_t c = 0; c < C; c++)
+			{
+				// Pick the row at or below the diagonal with the largest magnitude in column c.
+				uint32_t max_r = c;
+				for (uint32_t r = c + 1; r < R; r++)
+					if (fabs(mat[r][c]) > fabs(mat[max_r][c]))
+						max_r = r;
+
+				// Only an exact zero is treated as singular; there is no tolerance.
+				if (mat[max_r][c] == 0.0f)
+				{
+					result.set_identity_matrix();
+					return false;
+				}
+
+				std::swap(mat[c], mat[max_r]);
+				std::swap(result[c], result[max_r]);
+
+				// Scale the pivot row so the pivot becomes 1. result must be divided first,
+				// because the second statement overwrites mat[c][c].
+				result[c] /= mat[c][c];
+				mat[c] /= mat[c][c];
+
+				// Eliminate column c from every other row (above and below the pivot).
+				for (uint32_t row = 0; row < R; row++)
+				{
+					if (row != c)
+					{
+						const row_vec temp(mat[row][c]);
+						mat[row] -= row_vec::mul_components(mat[c], temp);
+						result[row] -= row_vec::mul_components(result[c], temp);
+					}
+				}
+			}
+
+			return true;
+		}
+
+		// Replaces this matrix with its inverse and returns *this.
+		// The success flag of invert() is ignored: if inversion fails, this matrix becomes the identity.
+		matrix& invert_in_place()
+		{
+			matrix result;
+			invert(result);
+			*this = result;
+			return *this;
+		}
+
+		// Returns the inverse as a new matrix.
+		// The success flag of invert() is ignored: if inversion fails, the identity matrix is returned.
+		matrix get_inverse() const
+		{
+			matrix result;
+			invert(result);
+			return result;
+		}
+
+		// Determinant of a square matrix; the computation is delegated to det_helper(*this, R).
+		T get_det() const
+		{
+			static_assert(R == C);
+			return det_helper(*this, R);
+		}
+
+		// True when every row of this matrix matches the corresponding row of b under row_vec::equal_tol with tolerance tol.
+		bool equal_tol(const matrix& b, float tol) const
+		{
+			for (uint32_t r = 0; r < R; r++)
+				if (!row_vec::equal_tol(m_rows[r], b.m_rows[r], tol))
+					return false;
+			return true;
+		}
+
+		// True when the matrix has as many rows as columns.
+		bool is_square() const
+		{
+			return R == C;
+		}
+
+		// Trace of a square matrix: the sum of the main-diagonal elements.
+		// The sum is accumulated in T and converted to double on return.
+		double get_trace() const
+		{
+			// Compare the template parameters directly: is_square() is an ordinary (non-constexpr,
+			// non-static) member function, so it cannot appear in a static_assert and made this
+			// method impossible to instantiate.
+			static_assert(R == C);
+
+			T total = 0;
+			for (uint32_t i = 0; i < R; i++)
+				total += (*this)(i, i);
+
+			return total;
+		}
+
+		// Debug helper: prints the matrix to stdout, one row per line, each element formatted with "%3.7f".
+		void print() const
+		{
+			for (uint32_t r = 0; r < R; r++)
+			{
+				for (uint32_t c = 0; c < C; c++)
+				{
+					// "%f" consumes a double; convert explicitly so that an integer T does not
+					// get passed through the varargs as an int (undefined behaviour).
+					printf("%3.7f ", static_cast<double>((*this)(r, c)));
+				}
+				printf("\n");
+			}
+		}
+
+		// This method transforms a vec by a matrix (D3D-style: row vector on left).
+		// Confusingly, note that the data type is named "col_vec", but mathematically it's actually written as a row vector (of size equal to the # matrix rows, which is why it's called a "col_vec" in this class).
+		// 1xR * RxC = 1xC
+		// This dots against the matrix columns.
+		// Implementation: the result is the sum over all rows r of (row r of b) scaled by a[r].
+		static inline row_vec transform(const col_vec& a, const matrix& b)
+		{
+			row_vec result(b[0] * a[0]);
+			for (uint32_t r = 1; r < R; r++)
+				result += b[r] * a[r];
+			return result;
+		}
+
+		// D3D-style point transform (row vector on the left).
+		// The last component of a is treated as 1, so the last matrix row is added unscaled.
+		static inline row_vec transform_point(const col_vec& a, const matrix& b)
+		{
+			row_vec sum(0);
+			for (int r = 0; r < (R - 1); r++)
+			{
+				sum += b[r] * a[r];
+			}
+			sum += b[R - 1];
+			return sum;
+		}
+
+		// D3D-style direction transform (row vector on the left).
+		// The last component of a is treated as 0, so the last matrix row does not contribute.
+		static inline row_vec transform_vector(const col_vec& a, const matrix& b)
+		{
+			row_vec sum(0);
+			for (int r = 0; r < (R - 1); r++)
+			{
+				sum += b[r] * a[r];
+			}
+			return sum;
+		}
+
+		// D3D-style point transform (row vector on the left) for the shorter subcol_vec type.
+		// Components of a beyond subcol_vec::num_elements are treated as 1; only the first C - 1 output components are produced.
+		static inline subcol_vec transform_point(const subcol_vec& a, const matrix& b)
+		{
+			subcol_vec out(0);
+			for (int row = 0; row < static_cast<int>(R); row++)
+			{
+				T scale;
+				if (row < subcol_vec::num_elements)
+					scale = a[row];
+				else
+					scale = 1.0f;
+
+				for (int col = 0; col < static_cast<int>(C - 1); col++)
+					out[col] += b[row][col] * scale;
+			}
+			return out;
+		}
+
+		// D3D-style direction transform (row vector on the left) for the shorter subcol_vec type.
+		// The last component is treated as 0: only the first R - 1 rows and first C - 1 columns of b are used.
+		static inline subcol_vec transform_vector(const subcol_vec& a, const matrix& b)
+		{
+			subcol_vec out(0);
+			for (int row = 0; row < static_cast<int>(R - 1); row++)
+			{
+				const T scale = a[row];
+				int col = 0;
+				while (col < static_cast<int>(C - 1))
+				{
+					out[col] += b[row][col] * scale;
+					col++;
+				}
+			}
+			return out;
+		}
+
+		// Same as the D3D-style transform(), except the (square) matrix is effectively transposed first:
+		// each output component is the dot product of a with one matrix row.
+		static inline col_vec transform_transposed(const col_vec& a, const matrix& b)
+		{
+			static_assert(R == C);
+			col_vec out;
+			uint32_t row = 0;
+			while (row < R)
+			{
+				out[row] = b[row].dot(a);
+				row++;
+			}
+			return out;
+		}
+
+		// Same as the D3D-style transform(), except the (square) matrix is effectively transposed first.
+		// The last component of a is treated as 0, so the last column of b is skipped.
+		static inline col_vec transform_vector_transposed(const col_vec& a, const matrix& b)
+		{
+			static_assert(R == C);
+			col_vec out;
+			uint32_t row = 0;
+			while (row < R)
+			{
+				T acc = 0;
+				for (uint32_t col = 0; col < (C - 1); col++)
+					acc += b[row][col] * a[col];
+
+				out[row] = acc;
+				row++;
+			}
+			return out;
+		}
+
+		// D3D-style point transform (row vector on the left) with the (square) matrix effectively transposed first.
+		// Components of a beyond subcol_vec::num_elements are treated as 1.
+		static inline subcol_vec transform_point_transposed(const subcol_vec& a, const matrix& b)
+		{
+			static_assert(R == C);
+			subcol_vec out(0);
+			for (int row = 0; row < R; row++)
+			{
+				T scale;
+				if (row < subcol_vec::num_elements)
+					scale = a[row];
+				else
+					scale = 1.0f;
+
+				for (int col = 0; col < (C - 1); col++)
+					out[col] += b[col][row] * scale;
+			}
+			return out;
+		}
+
+		// D3D-style direction transform (row vector on the left) with the (square) matrix effectively transposed first.
+		// The last component of a is treated as 0: only the upper-left (R - 1) x (C - 1) part of b is used.
+		static inline subcol_vec transform_vector_transposed(const subcol_vec& a, const matrix& b)
+		{
+			static_assert(R == C);
+			subcol_vec out(0);
+			for (int row = 0; row < static_cast<int>(R - 1); row++)
+			{
+				const T scale = a[row];
+				int col = 0;
+				while (col < static_cast<int>(C - 1))
+				{
+					out[col] += b[col][row] * scale;
+					col++;
+				}
+			}
+			return out;
+		}
+
+		// OGL-style transform: matrix on the left, column vector on the right (RxC * Cx1 = Rx1).
+		// The argument type is named "row_vec" because it has one element per matrix column, even though mathematically it is written as a column vector here.
+		// Each output component is the dot product of one matrix row with a.
+		static inline col_vec transform(const matrix& b, const row_vec& a)
+		{
+			col_vec out;
+			int row = 0;
+			while (row < static_cast<int>(R))
+			{
+				out[row] = b[row].dot(a);
+				row++;
+			}
+			return out;
+		}
+
+		// OGL-style transform (column vector on the right), except the (square) matrix is effectively transposed first.
+		// The argument type is named "row_vec" because it has one element per matrix column.
+		// The result is the sum of the matrix rows, each scaled by the matching element of a (i.e. a dotted against each matrix column).
+		static inline col_vec transform_transposed(const matrix& b, const row_vec& a)
+		{
+			static_assert(R == C);
+			row_vec sum(b[0] * a[0]);
+			int row = 1;
+			while (row < static_cast<int>(R))
+			{
+				sum += b[row] * a[row];
+				row++;
+			}
+			return col_vec(sum);
+		}
+
+		// Component-wise (element by element) product of lhs and rhs, written row by row into result. Returns result.
+		static inline matrix& mul_components(matrix& result, const matrix& lhs, const matrix& rhs)
+		{
+			uint32_t row = 0;
+			while (row < R)
+			{
+				result[row] = row_vec::mul_components(lhs[row], rhs[row]);
+				row++;
+			}
+			return result;
+		}
+
+		// Multiplies lhs by rhs via matrix_mul_helper() and stores the product back into lhs.
+		// A copy of lhs is used as the left operand so the helper never reads from its own output.
+		static inline matrix& concat(matrix& lhs, const matrix& rhs)
+		{
+			const matrix lhs_copy(lhs);
+			return matrix_mul_helper(lhs, lhs_copy, rhs);
+		}
+
+		// Member form of concat(): replaces this matrix with (this * rhs) and returns *this.
+		inline matrix& concat_in_place(const matrix& rhs)
+		{
+			matrix& self = *this;
+			return concat(self, rhs);
+		}
+
+		// Computes result = lhs * rhs via matrix_mul_helper() and returns result.
+		// Safe when result is the same object as lhs and/or rhs: in that case the product is built in a temporary first.
+		static inline matrix& multiply(matrix& result, const matrix& lhs, const matrix& rhs)
+		{
+			const bool result_aliases_input = (&result == &lhs) || (&result == &rhs);
+			if (!result_aliases_input)
+			{
+				matrix_mul_helper(result, lhs, rhs);
+				return result;
+			}
+
+			matrix product;
+			matrix_mul_helper(product, lhs, rhs);
+			result = product;
+			return result;
+		}
+
+		// Returns a matrix that has been reset with clear().
+		static matrix make_zero_matrix()
+		{
+			matrix zero;
+			zero.clear();
+			return zero;
+		}
+
+		// Returns a matrix initialized with set_identity_matrix().
+		static matrix make_identity_matrix()
+		{
+			matrix identity;
+			identity.set_identity_matrix();
+			return identity;
+		}
+
+		// Builds a translation matrix: starts from an identity matrix and applies set_translate(t).
+		// NOTE(review): this overload calls set_translate() while the scalar overloads call set_translate_matrix() - confirm both setters are intended.
+		static matrix make_translate_matrix(const row_vec& t)
+		{
+			return matrix(basisu::cIdentity).set_translate(t);
+		}
+
+		// Builds a translation matrix from an identity matrix via set_translate_matrix(x, y).
+		static matrix make_translate_matrix(float x, float y)
+		{
+			return matrix(basisu::cIdentity).set_translate_matrix(x, y);
+		}
+
+		// Builds a translation matrix from an identity matrix via set_translate_matrix(x, y, z).
+		static matrix make_translate_matrix(float x, float y, float z)
+		{
+			return matrix(basisu::cIdentity).set_translate_matrix(x, y, z);
+		}
+
+		// Builds a scale matrix by calling set_scale_matrix(s) on a default-constructed matrix.
+		// NOTE(review): presumably set_scale_matrix() initializes every element - confirm, since the temporary is not explicitly set to identity first.
+		static inline matrix make_scale_matrix(float s)
+		{
+			return matrix().set_scale_matrix(s);
+		}
+
+		// Builds a scale matrix by calling set_scale_matrix(s) (per-component scale vector) on a default-constructed matrix.
+		static inline matrix make_scale_matrix(const row_vec& s)
+		{
+			return matrix().set_scale_matrix(s);
+		}
+
+		// 2D scale for matrices of at least 3x3: identity with the first two diagonal entries set to x and y.
+		static inline matrix make_scale_matrix(float x, float y)
+		{
+			static_assert(R >= 3 && C >= 3);
+			matrix scale;
+			scale.set_identity_matrix();
+			scale.m_rows[1][1] = y;
+			scale.m_rows[0][0] = x;
+			return scale;
+		}
+
+		// 3D scale for matrices of at least 4x4: identity with the first three diagonal entries set to x, y and z.
+		static inline matrix make_scale_matrix(float x, float y, float z)
+		{
+			static_assert(R >= 4 && C >= 4);
+			matrix scale;
+			scale.set_identity_matrix();
+			scale.m_rows[2][2] = z;
+			scale.m_rows[1][1] = y;
+			scale.m_rows[0][0] = x;
+			return scale;
+		}
+
+		// Helpers derived from Graphics Gems 1 and 2 (Matrices and Transformations, Ronald N. Goldman)
+		// Builds a rotation of ang about the given axis (the axis is normalized internally).
+		// ang is passed straight to cos()/sin(), so it is in radians.
+		// Only the upper-left 3x3 block is written; the rest of the matrix stays identity.
+		static matrix make_rotate_matrix(const vec<3, T>& axis, T ang)
+		{
+			static_assert(R >= 3 && C >= 3);
+
+			vec<3, T> norm_axis(axis.get_normalized());
+
+			// The trig terms are evaluated in double precision and cast back to T when stored.
+			double cos_a = cos(ang);
+			double inv_cos_a = 1.0f - cos_a;
+
+			double sin_a = sin(ang);
+
+			const T x = norm_axis[0];
+			const T y = norm_axis[1];
+			const T z = norm_axis[2];
+
+			const double x2 = norm_axis[0] * norm_axis[0];
+			const double y2 = norm_axis[1] * norm_axis[1];
+			const double z2 = norm_axis[2] * norm_axis[2];
+
+			matrix result;
+			result.set_identity_matrix();
+
+			// Column 0
+			result[0][0] = (T)((inv_cos_a * x2) + cos_a);
+			result[1][0] = (T)((inv_cos_a * x * y) + (sin_a * z));
+			result[2][0] = (T)((inv_cos_a * x * z) - (sin_a * y));
+
+			// Column 1
+			result[0][1] = (T)((inv_cos_a * x * y) - (sin_a * z));
+			result[1][1] = (T)((inv_cos_a * y2) + cos_a);
+			result[2][1] = (T)((inv_cos_a * y * z) + (sin_a * x));
+
+			// Column 2
+			result[0][2] = (T)((inv_cos_a * x * z) + (sin_a * y));
+			result[1][2] = (T)((inv_cos_a * y * z) - (sin_a * x));
+			result[2][2] = (T)((inv_cos_a * z2) + cos_a);
+
+			return result;
+		}
+
+		// 2D rotation by ang (radians, as passed to sin()/cos()): identity matrix whose upper-left 2x2 block is
+		// [ cos -sin ]
+		// [ sin  cos ]
+		static inline matrix make_rotate_matrix(T ang)
+		{
+			static_assert(R >= 2 && C >= 2);
+
+			matrix rot(basisu::cIdentity);
+
+			const T s = static_cast<T>(sin(ang));
+			const T c = static_cast<T>(cos(ang));
+
+			rot[0][0] = +c;
+			rot[0][1] = -s;
+			rot[1][0] = +s;
+			rot[1][1] = +c;
+
+			return rot;
+		}
+
+		// Rotation of ang about a coordinate axis selected by index (the unit vector with a 1 at position axis).
+		// Forwards to the axis/angle overload. axis is used as an index into a 3-component vector.
+		static inline matrix make_rotate_matrix(uint32_t axis, T ang)
+		{
+			vec<3, T> unit_axis;
+			unit_axis.clear();
+			unit_axis[axis] = 1.0f;
+			return make_rotate_matrix(unit_axis, ang);
+		}
+
+		// Builds the skew-symmetric matrix of c in the upper-left 3x3 block of an otherwise cleared matrix:
+		// [  0    c.z  -c.y ]
+		// [ -c.z  0     c.x ]
+		// [  c.y -c.x   0   ]
+		static inline matrix make_cross_product_matrix(const vec<3, scalar_type>& c)
+		{
+			static_assert((num_rows >= 3) && (num_cols >= 3));
+			matrix skew(basisu::cClear);
+
+			// Row 0
+			skew[0][1] = c[2];
+			skew[0][2] = -c[1];
+
+			// Row 1
+			skew[1][0] = -c[2];
+			skew[1][2] = c[0];
+
+			// Row 2
+			skew[2][0] = c[1];
+			skew[2][1] = -c[0];
+
+			return skew;
+		}
+
+		// Builds a 4x4 reflection matrix as I - 2 * (n (x) n), with translation 2 * (q . n) * n.
+		// Both n and q must satisfy is_vector() (asserted).
+		// NOTE(review): presumably n is the (unit) normal of the mirror plane and q a position on it, by analogy with make_ortho_projection_matrix - confirm.
+		static inline matrix make_reflection_matrix(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q)
+		{
+			static_assert((num_rows == 4) && (num_cols == 4));
+			matrix ret;
+			assert(n.is_vector() && q.is_vector());
+			ret = make_identity_matrix() - 2.0f * make_tensor_product_matrix(n, n);
+			ret.set_translate((2.0f * q.dot(n) * n).as_point());
+			return ret;
+		}
+
+		// Tensor (outer) product of v and w: row r of the result is w multiplied component-wise by v.broadcast(r).
+		static inline matrix make_tensor_product_matrix(const row_vec& v, const row_vec& w)
+		{
+			matrix product;
+			int row = 0;
+			while (row < num_rows)
+			{
+				product[row] = row_vec::mul_components(v.broadcast(row), w);
+				row++;
+			}
+			return product;
+		}
+
+		// Builds a 4x4 matrix as c * I with translation (1 - c) * q. q must satisfy is_vector() (asserted).
+		// NOTE(review): presumably a uniform scale by c about the position q - confirm.
+		static inline matrix make_uniform_scaling_matrix(const vec<4, scalar_type>& q, scalar_type c)
+		{
+			static_assert((num_rows == 4) && (num_cols == 4));
+			assert(q.is_vector());
+			matrix ret;
+			ret = c * make_identity_matrix();
+			ret.set_translate(((1.0f - c) * q).as_point());
+			return ret;
+		}
+
+		// Builds a 4x4 matrix as I - (1 - c) * (w (x) w) with translation (1 - c) * (q . w) * w.
+		// Both q and w must satisfy is_vector() (asserted).
+		// NOTE(review): presumably a scale by c along direction w about the position q - confirm.
+		static inline matrix make_nonuniform_scaling_matrix(const vec<4, scalar_type>& q, scalar_type c, const vec<4, scalar_type>& w)
+		{
+			static_assert((num_rows == 4) && (num_cols == 4));
+			assert(q.is_vector() && w.is_vector());
+			matrix ret;
+			ret = make_identity_matrix() - (1.0f - c) * make_tensor_product_matrix(w, w);
+			ret.set_translate(((1.0f - c) * q.dot(w) * w).as_point());
+			return ret;
+		}
+
+		// n = normal of plane, q = point on plane
+		// Builds the matrix I - (n (x) n) with translation (q . n) * n.
+		// Both n and q must satisfy is_vector() (asserted).
+		// NOTE(review): unlike the sibling helpers there is no static_assert restricting this to 4x4 matrices.
+		static inline matrix make_ortho_projection_matrix(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q)
+		{
+			assert(n.is_vector() && q.is_vector());
+			matrix ret;
+			ret = make_identity_matrix() - make_tensor_product_matrix(n, n);
+			ret.set_translate((q.dot(n) * n).as_point());
+			return ret;
+		}
+
+		// Builds the matrix I - (n (x) w) / (w . n) with translation ((q . n) / (w . n)) * w.
+		// n, q and w must all satisfy is_vector() (asserted).
+		// NOTE(review): presumably a projection along direction w onto the plane with normal n through q - confirm.
+		// (w . n) is used as a divisor and is not checked against zero.
+		static inline matrix make_parallel_projection(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q, const vec<4, scalar_type>& w)
+		{
+			assert(n.is_vector() && q.is_vector() && w.is_vector());
+			matrix ret;
+			ret = make_identity_matrix() - (make_tensor_product_matrix(n, w) / (w.dot(n)));
+			ret.set_translate(((q.dot(n) / w.dot(n)) * w).as_point());
+			return ret;
+		}
+
+	protected:
+		row_vec m_rows[R];
+
+		// Recursively computes the determinant of the upper-left n x n block of a by cofactor expansion along row 0.
+		// Algorithm ported from Numerical Recipes in C.
+		// Cost grows factorially with n, so this is only suitable for small matrices.
+		static T det_helper(const matrix& a, uint32_t n)
+		{
+			// 1x1: the determinant is the single element. Without this case the expansion below would
+			// recurse to n == 0 and report 0 for every 1x1 matrix.
+			if (n == 1)
+				return a(0, 0);
+
+			if (n == 2)
+				return a(0, 0) * a(1, 1) - a(1, 0) * a(0, 1);
+
+			T d = 0;
+
+			// Scratch matrix holding the current minor; only its upper-left (n - 1) x (n - 1) block is written and read.
+			matrix m;
+
+			for (uint32_t skip_col = 0; skip_col < n; skip_col++)
+			{
+				// Build the minor: drop row 0 and column skip_col.
+				for (uint32_t src_row = 1; src_row < n; src_row++)
+				{
+					uint32_t dst_col = 0;
+					for (uint32_t src_col = 0; src_col < n; src_col++)
+					{
+						if (src_col != skip_col)
+						{
+							m(src_row - 1, dst_col) = a(src_row, src_col);
+							dst_col++;
+						}
+					}
+				}
+
+				// Cofactor signs alternate along the row, starting with + at column 0.
+				d += ((skip_col & 1) ? -1.0f : 1.0f) * a(0, skip_col) * det_helper(m, n - 1);
+			}
+
+			return d;
+		}
+	};
+
+	typedef matrix<2, 2, float> matrix22F;
+	typedef matrix<2, 2, double> matrix22D;
+
+	typedef matrix<3, 3, float> matrix33F;
+	typedef matrix<3, 3, double> matrix33D;
+
+	typedef matrix<4, 4, float> matrix44F;
+	typedef matrix<4, 4, double> matrix44D;
+
+	typedef matrix<8, 8, float> matrix88F;
+
+	// These helpers create good old D3D-style matrices.
+	// Off-center perspective projection (left-handed) for the view volume [l, r] x [b, t] at the near plane, with near/far distances nz/fz.
+	inline matrix44F matrix44F_make_perspective_offcenter_lh(float l, float r, float b, float t, float nz, float fz)
+	{
+		const float near2 = 2.0f * nz;
+		const float inv_w = 1.0f / (r - l);
+		const float inv_h = 1.0f / (t - b);
+
+		matrix44F proj;
+		proj[0].set(near2 * inv_w, 0.0f, 0.0f, 0.0f);
+		proj[1].set(0.0f, near2 * inv_h, 0.0f, 0.0f);
+		proj[2].set(-(l + r) * inv_w, -(t + b) * inv_h, fz / (fz - nz), 1.0f);
+		// The Z offset is derived from the Z scale stored just above.
+		proj[3].set(0.0f, 0.0f, -proj[2][2] * nz, 0.0f);
+		return proj;
+	}
+
+	// Symmetric perspective projection (left-handed).
+	// fov_y: full Y field of view (radians)
+	// aspect: viewspace width/height
+	// nz, fz: near and far plane distances
+	inline matrix44F matrix44F_make_perspective_fov_lh(float fov_y, float aspect, float nz, float fz)
+	{
+		const double half_fov_sin = sin(0.5f * fov_y);
+		const double half_fov_cos = cos(0.5f * fov_y);
+
+		// y scale = cot(fov_y / 2); x scale additionally divides by the aspect ratio.
+		const float sy = static_cast<float>(half_fov_cos / half_fov_sin);
+		const float sx = static_cast<float>(sy / aspect);
+
+		matrix44F proj;
+		proj[0].set(sx, 0, 0, 0);
+		proj[1].set(0, sy, 0, 0);
+		proj[2].set(0, 0, fz / (fz - nz), 1);
+		proj[3].set(0, 0, -nz * fz / (fz - nz), 0);
+		return proj;
+	}
+
+	// Off-center orthographic projection (left-handed) for the box [l, r] x [b, t] x [nz, fz].
+	inline matrix44F matrix44F_make_ortho_offcenter_lh(float l, float r, float b, float t, float nz, float fz)
+	{
+		matrix44F proj;
+
+		// Scale terms on the diagonal.
+		proj[0].set(2.0f / (r - l), 0.0f, 0.0f, 0.0f);
+		proj[1].set(0.0f, 2.0f / (t - b), 0.0f, 0.0f);
+		proj[2].set(0.0f, 0.0f, 1.0f / (fz - nz), 0.0f);
+
+		// Offset terms in the last row.
+		proj[3].set((l + r) / (l - r), (t + b) / (b - t), nz / (nz - fz), 1.0f);
+
+		return proj;
+	}
+
+	// Centered orthographic projection (left-handed) of width w and height h; forwards to the off-center variant.
+	inline matrix44F matrix44F_make_ortho_lh(float w, float h, float nz, float fz)
+	{
+		const float left = -w * .5f;
+		const float right = w * .5f;
+		const float bottom = -h * .5f;
+		const float top = h * .5f;
+		return matrix44F_make_ortho_offcenter_lh(left, right, bottom, top, nz, fz);
+	}
+
+	// Maps projection space to a screen-space viewport: X is scaled by w/2, Y by -h/2 (flipped), Z by (max_z - min_z),
+	// then offset by the viewport center (x + w/2, y + h/2) and min_z.
+	inline matrix44F matrix44F_make_projection_to_screen_d3d(int x, int y, int w, int h, float min_z, float max_z)
+	{
+		matrix44F to_screen;
+
+		// Scale rows.
+		to_screen[0].set(w * .5f, 0.0f, 0.0f, 0.0f);
+		to_screen[1].set(0, h * -.5f, 0.0f, 0.0f);
+		to_screen[2].set(0, 0.0f, max_z - min_z, 0.0f);
+
+		// Offset row.
+		to_screen[3].set(x + w * .5f, y + h * .5f, min_z, 1.0f);
+
+		return to_screen;
+	}
+
+	// Builds a left-handed look-at (world to view) matrix: a translation by -camera_pos, followed by a rotation whose
+	// columns are the camera basis vectors, followed by a rotation of camera_roll_ang_in_radians about axis index 2 (Z).
+	inline matrix44F matrix44F_make_lookat_lh(const vec3F& camera_pos, const vec3F& look_at, const vec3F& camera_up, float camera_roll_ang_in_radians)
+	{
+		// col2 = view direction. Falls back to +Z when camera_pos and look_at coincide.
+		vec4F col2(look_at - camera_pos);
+		assert(col2.is_vector());
+		if (col2.normalize() == 0.0f)
+			col2.set(0, 0, 1, 0);
+
+		// col1 = up vector. When the view direction has no X or Z component, -X is used instead of camera_up.
+		vec4F col1(camera_up);
+		assert(col1.is_vector());
+		if (!col2[0] && !col2[2])
+			col1.set(-1.0f, 0.0f, 0.0f, 0.0f);
+
+		// If up is (nearly) aligned with the view direction, fall back to +Y.
+		// NOTE(review): col1 is not normalized before this dot product and only the positive direction is tested - confirm intent.
+		if ((col1.dot(col2)) > .9999f)
+			col1.set(0.0f, 1.0f, 0.0f, 0.0f);
+
+		// col0 = up x dir, then up is recomputed as dir x col0 so the three axes are mutually perpendicular.
+		vec4F col0(vec4F::cross3(col1, col2).normalize_in_place());
+		col1 = vec4F::cross3(col2, col0).normalize_in_place();
+
+		matrix44F rotm(matrix44F::make_identity_matrix());
+		rotm.set_col(0, col0);
+		rotm.set_col(1, col1);
+		rotm.set_col(2, col2);
+		return matrix44F::make_translate_matrix(-camera_pos[0], -camera_pos[1], -camera_pos[2]) * rotm * matrix44F::make_rotate_matrix(2, camera_roll_ang_in_radians);
+	}
+
+	// Builds the N x N DCT basis matrix for the square matrix type R:
+	//   row 0:     1 / sqrt(N)
+	//   row k > 0: sqrt(2 / N) * cos(pi * (2n + 1) * k / (2N)), n = column index
+	// Instantiating this with a non-square matrix type is a compile-time error.
+	template<typename R> R matrix_NxN_create_DCT()
+	{
+		// The dimensions are compile-time constants, so check them at compile time rather than with a runtime assert.
+		static_assert(R::num_rows == R::num_cols);
+
+		const uint32_t N = R::num_cols;
+
+		R result;
+		for (uint32_t k = 0; k < N; k++)
+		{
+			for (uint32_t n = 0; n < N; n++)
+			{
+				double f;
+
+				if (!k)
+					f = 1.0f / sqrt(float(N));
+				else
+					f = sqrt(2.0f / float(N)) * cos((basisu::cPiD * (2.0f * float(n) + 1.0f) * float(k)) / (2.0f * float(N)));
+
+				result(k, n) = static_cast<typename R::scalar_type>(f);
+			}
+		}
+
+		return result;
+	}
+
+	// Forward 2D transform of a with the basis matrix dct: result = dct * a * dct^T.
+	template<typename R> R matrix_NxN_DCT(const R& a, const R& dct)
+	{
+		// First pass: dct * a
+		R left_product;
+		matrix_mul_helper<R, R, R>(left_product, dct, a);
+
+		// Second pass: (dct * a) * dct^T
+		R transformed;
+		matrix_mul_helper_transpose_rhs<R, R, R>(transformed, left_product, dct);
+		return transformed;
+	}
+
+	// Inverse 2D transform of b with the basis matrix dct: result = dct^T * b * dct.
+	template<typename R> R matrix_NxN_IDCT(const R& b, const R& dct)
+	{
+		// First pass: dct^T * b
+		R left_product;
+		matrix_mul_helper_transpose_lhs<R, R, R>(left_product, dct, b);
+
+		// Second pass: (dct^T * b) * dct
+		R reconstructed;
+		matrix_mul_helper<R, R, R>(reconstructed, left_product, dct);
+		return reconstructed;
+	}
+
+	// Kronecker product of a and b: every element a(r, c) is replaced by the block a(r, c) * b.
+	// The result has (rows of a * rows of b) rows and (cols of a * cols of b) columns, and uses the scalar type of X.
+	template<typename X, typename Y> matrix<X::num_rows* Y::num_rows, X::num_cols* Y::num_cols, typename X::scalar_type> matrix_kronecker_product(const X& a, const Y& b)
+	{
+		matrix<X::num_rows* Y::num_rows, X::num_cols* Y::num_cols, typename X::scalar_type> product;
+
+		for (uint32_t a_row = 0; a_row < X::num_rows; a_row++)
+		{
+			for (uint32_t a_col = 0; a_col < X::num_cols; a_col++)
+			{
+				// Fill the block owned by a(a_row, a_col).
+				for (uint32_t b_row = 0; b_row < Y::num_rows; b_row++)
+				{
+					for (uint32_t b_col = 0; b_col < Y::num_cols; b_col++)
+					{
+						product(a_row * Y::num_rows + b_row, a_col * Y::num_cols + b_col) = a(a_row, a_col) * b(b_row, b_col);
+					}
+				}
+			}
+		}
+
+		return product;
+	}
+
+	// Stacks b underneath a, producing a matrix with (rows of a + rows of b) rows and the column count and scalar type of X.
+	// a and b must have the same number of columns (checked at compile time).
+	template<typename X, typename Y> matrix<X::num_rows + Y::num_rows, X::num_cols, typename X::scalar_type> matrix_combine_vertically(const X& a, const Y& b)
+	{
+		// The result only has X::num_cols columns; a wider or narrower b would be written out of bounds or leave elements unset.
+		static_assert(X::num_cols == Y::num_cols);
+
+		matrix<X::num_rows + Y::num_rows, X::num_cols, typename X::scalar_type> result;
+
+		// Top part: rows of a.
+		for (uint32_t r = 0; r < X::num_rows; r++)
+			for (uint32_t c = 0; c < X::num_cols; c++)
+				result(r, c) = a(r, c);
+
+		// Bottom part: rows of b, placed after the rows of a.
+		for (uint32_t r = 0; r < Y::num_rows; r++)
+			for (uint32_t c = 0; c < Y::num_cols; c++)
+				result(r + X::num_rows, c) = b(r, c);
+
+		return result;
+	}
+
+	// Builds an 8x8 Haar matrix by repeated Kronecker products:
+	// at each level the previous matrix is combined with [1 1] and an identity matrix with [1 -1], and the two halves are stacked vertically.
+	// Some rows are rescaled before the final stacking (rows 2 and 3 of the upper half by sqrt(2), the whole lower half by 2).
+	inline matrix88F get_haar8()
+	{
+		matrix22F haar2(
+			1, 1,
+			1, -1);
+		matrix22F i2(
+			1, 0,
+			0, 1);
+		matrix44F i4(
+			1, 0, 0, 0,
+			0, 1, 0, 0,
+			0, 0, 1, 0,
+			0, 0, 0, 1);
+
+		// b0 = [1 1], b1 = [1 -1]
+		matrix<1, 2, float> b0; b0(0, 0) = 1; b0(0, 1) = 1;
+		matrix<1, 2, float> b1; b1(0, 0) = 1.0f; b1(0, 1) = -1.0f;
+
+		// 4x4 level
+		matrix<2, 4, float> haar4_0 = matrix_kronecker_product(haar2, b0);
+		matrix<2, 4, float> haar4_1 = matrix_kronecker_product(i2, b1);
+
+		matrix<4, 4, float> haar4 = matrix_combine_vertically(haar4_0, haar4_1);
+
+		// 8x8 level
+		matrix<4, 8, float> haar8_0 = matrix_kronecker_product(haar4, b0);
+		matrix<4, 8, float> haar8_1 = matrix_kronecker_product(i4, b1);
+
+		// Per-row rescaling of the two halves.
+		haar8_0[2] *= sqrtf(2);
+		haar8_0[3] *= sqrtf(2);
+		haar8_1 *= 2.0f;
+
+		matrix<8, 8, float> haar8 = matrix_combine_vertically(haar8_0, haar8_1);
+
+		return haar8;
+	}
+
+	// Returns a fixed 4x4 Haar matrix whose entries are 0, +/-0.5 and +/-(0.5 * sqrt(2)).
+	inline matrix44F get_haar4()
+	{
+		const float root2 = 1.4142135623730951f;
+		const float h = .5f * 1;
+		const float hr = .5f * root2;
+
+		return matrix44F(
+			h, h, h, h,
+			h, h, -h, -h,
+			hr, -hr, 0, 0,
+			0, 0, hr, -hr);
+	}
+
+	// Inverse of a 2x2 matrix via the adjugate divided by the determinant, evaluated in double precision.
+	// When the determinant is exactly 0 the adjugate is scaled by 0, i.e. an all-zero matrix is returned.
+	template<typename T>
+	inline matrix<2, 2, T> get_inverse_2x2(const matrix<2, 2, T>& m)
+	{
+		const double m00 = m[0][0];
+		const double m01 = m[0][1];
+		const double m10 = m[1][0];
+		const double m11 = m[1][1];
+
+		double scale = m00 * m11 - m01 * m10;
+		if (scale != 0.0f)
+			scale = 1.0f / scale;
+
+		matrix<2, 2, T> inverse;
+		inverse[0][0] = static_cast<T>(m11 * scale);
+		inverse[0][1] = static_cast<T>(-m01 * scale);
+		inverse[1][0] = static_cast<T>(-m10 * scale);
+		inverse[1][1] = static_cast<T>(m00 * scale);
+		return inverse;
+	}
+
+} // namespace bu_math
+
+namespace basisu
+{
+	// Running statistics over integer samples: keeps the sample count, the sum and the sum of squares,
+	// from which the average, population standard deviation and variance are derived.
+	class tracked_stat
+	{
+	public:
+		tracked_stat() { clear(); }
+
+		// Forgets all samples.
+		inline void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+
+		// Adds one signed sample.
+		inline void update(int32_t val) { accumulate(val); }
+
+		// Adds one unsigned sample. The value is widened directly to 64 bits so values above INT32_MAX stay positive.
+		inline tracked_stat& operator += (uint32_t val) { accumulate(val); return *this; }
+
+		inline uint32_t get_number_of_values() const { return m_num; }
+		inline uint64_t get_total() const { return m_total; }
+		inline uint64_t get_total2() const { return m_total2; }
+
+		// All of the following return 0 when no samples have been added.
+		inline float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; };
+		inline float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+		inline float get_variance() const { float s = get_std_dev(); return s * s; }
+
+	private:
+		// Squares in 64 bits: squaring in 32-bit int overflows (undefined behavior) once |val| exceeds 46340.
+		inline void accumulate(int64_t val) { m_num++; m_total += val; m_total2 += val * val; }
+
+		uint32_t m_num;
+		int64_t m_total;
+		int64_t m_total2;
+	};
+
+	// Running statistics over double-precision samples: keeps the sample count, the sum and the sum of squares,
+	// from which the average, population standard deviation and variance are derived.
+	class tracked_stat_dbl
+	{
+	public:
+		tracked_stat_dbl() { clear(); }
+
+		// Forgets all samples.
+		inline void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+
+		// Adds one sample.
+		inline void update(double val) { m_num++; m_total += val; m_total2 += val * val; }
+
+		inline tracked_stat_dbl& operator += (double val) { update(val); return *this; }
+
+		inline uint64_t get_number_of_values() const { return m_num; }
+		inline double get_total() const { return m_total; }
+		inline double get_total2() const { return m_total2; }
+
+		// All of the following return 0 when no samples have been added.
+		inline double get_average() const { return m_num ? m_total / (double)m_num : 0.0f; };
+
+		inline double get_std_dev() const
+		{
+			if (!m_num)
+				return 0.0f;
+
+			// n * sum(x^2) - sum(x)^2 is mathematically >= 0, but rounding can push it slightly below zero
+			// (e.g. for identical samples), which would make sqrt() return NaN. Treat that case as zero spread.
+			const double radicand = (double)(m_num * m_total2 - m_total * m_total);
+			if (radicand < 0.0)
+				return 0.0;
+
+			return sqrt(radicand) / m_num;
+		}
+
+		inline double get_variance() const { double s = get_std_dev(); return s * s; }
+
+	private:
+		uint64_t m_num;
+		double m_total;
+		double m_total2;
+	};
+
+	// Descriptive statistics over a strided array of values, stored as FloatType.
+	// Call calc() for the full set, calc_simplified() for just the mean/variance/standard deviation,
+	// or calc_median() for just the median. Fields that a method does not compute keep the values set by clear().
+	template<typename FloatType>
+	struct stats
+	{
+		uint32_t m_n;
+		FloatType m_total, m_total_sq;		// total, total of squares values
+		FloatType m_avg, m_avg_sq;			// mean, mean of the squared values
+		FloatType m_rms;					// sqrt(m_avg_sq)
+		FloatType m_std_dev, m_var;			// population standard deviation and variance
+		FloatType m_mad;					// mean absolute deviation
+		FloatType m_min, m_max, m_range;	// min and max values, and max-min
+		FloatType m_len;					// length of values as a vector (Euclidean norm or L2 norm)
+		FloatType m_coeff_of_var;			// coefficient of variation (std_dev/mean), High CV: Indicates greater variability relative to the mean, meaning the data values are more spread out, 
+											// Low CV : Indicates less variability relative to the mean, meaning the data values are more consistent.
+		
+		FloatType m_skewness;				// Skewness = 0: The data is perfectly symmetric around the mean, 
+											// Skewness > 0: The data is positively skewed (right-skewed), 
+											// Skewness < 0: The data is negatively skewed (left-skewed)
+											// 0-.5 approx. symmetry, .5-1 moderate skew, >= 1 highly skewed
+		
+		FloatType m_kurtosis;				// Excess Kurtosis: Kurtosis = 0: The distribution has normal kurtosis (mesokurtic)
+											// Kurtosis > 0: The distribution is leptokurtic, with heavy tails and a sharp peak
+											// Kurtosis < 0: The distribution is platykurtic, with light tails and a flatter peak
+
+		bool m_any_zero;					// true if any input value compared equal to zero (set by calc() only)
+
+		FloatType m_median;					// median value (set by calc_median(), or by calc() when requested)
+		uint32_t m_median_index;			// original index of the element at sorted position n/2
+
+		stats() 
+		{ 
+			clear(); 
+		}
+
+		// Resets every field. m_min/m_max are set to +/-BIG_FLOAT_VAL; everything else is zeroed.
+		void clear()
+		{
+			m_n = 0;
+			m_total = 0, m_total_sq = 0;
+			m_avg = 0, m_avg_sq = 0;
+			m_rms = 0;
+			m_std_dev = 0, m_var = 0;
+			m_mad = 0;
+			m_min = BIG_FLOAT_VAL, m_max = -BIG_FLOAT_VAL; m_range = 0.0f;
+			m_len = 0;
+			m_coeff_of_var = 0;
+			m_skewness = 0;
+			m_kurtosis = 0;
+			m_any_zero = false;
+			
+			m_median = 0;
+			m_median_index = 0;
+		}
+
+		// Computes m_median and m_median_index only (other fields are untouched).
+		// n: number of values, pVals: first value, stride: distance in elements between consecutive values.
+		// Works on a sorted copy, so the input is not modified.
+		template<typename T>
+		void calc_median(uint32_t n, const T* pVals, uint32_t stride = 1)
+		{
+			m_median = 0;
+			m_median_index = 0;
+
+			if (!n)
+				return;
+
+			// Pair each value with its original index so the index survives the sort.
+			basisu::vector< std::pair<T, uint32_t> > vals(n);
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				vals[i].first = pVals[i * stride];
+				vals[i].second = i;
+			}
+
+			std::sort(vals.begin(), vals.end(), [](const std::pair<T, uint32_t>& a, const std::pair<T, uint32_t>& b) {
+				return a.first < b.first;
+				});
+
+			// For an even count the median is the average of the two middle values.
+			m_median = vals[n / 2].first;
+			if ((n & 1) == 0)
+				m_median = (m_median + vals[(n / 2) - 1].first) * .5f;
+
+			m_median_index = vals[n / 2].second;
+		}
+
+		// Computes all statistics. The median is only computed when calc_median_flag is true.
+		// n: number of values, pVals: first value, stride: distance in elements between consecutive values.
+		// With n == 0 all fields are left at their clear() values.
+		template<typename T>
+		void calc(uint32_t n, const T* pVals, uint32_t stride = 1, bool calc_median_flag = false)
+		{
+			clear();
+						
+			if (!n)
+				return;
+
+			if (calc_median_flag)
+				calc_median(n, pVals, stride);
+
+			m_n = n;
+
+			// Pass 1: sums, min/max and zero detection.
+			for (uint32_t i = 0; i < n; i++)
+			{
+				FloatType v = (FloatType)pVals[i * stride];
+
+				if (v == 0.0f)
+					m_any_zero = true;
+				
+				m_total += v;
+				m_total_sq += v * v;
+				
+				if (!i)
+				{
+					m_min = v;
+					m_max = v;
+				}
+				else
+				{
+					m_min = minimum(m_min, v);
+					m_max = maximum(m_max, v);
+				}
+			}
+
+			m_range = m_max - m_min;
+
+			m_len = sqrt(m_total_sq);
+
+			const FloatType nd = (FloatType)n;
+
+			m_avg = m_total / nd;
+			m_avg_sq = m_total_sq / nd;
+			m_rms = sqrt(m_avg_sq);
+			
+			// Pass 2: central moments about the mean (m_skewness/m_kurtosis temporarily hold the raw 3rd/4th moment sums).
+			for (uint32_t i = 0; i < n; i++)
+			{
+				FloatType v = (FloatType)pVals[i * stride];
+				FloatType d = v - m_avg;
+				
+				const FloatType d2 = d * d;
+				const FloatType d3 = d2 * d;
+				const FloatType d4 = d3 * d;
+
+				m_var += d2;
+				m_mad += fabs(d);
+				m_skewness += d3;
+				m_kurtosis += d4;
+			}
+
+			m_var /= nd;
+			m_mad /= nd;
+
+			m_std_dev = sqrt(m_var);
+
+			m_coeff_of_var = (m_avg != 0.0f) ? (m_std_dev / fabs(m_avg)) : 0.0f;
+
+			// Normalize the 3rd/4th moments by std_dev^3 / std_dev^4; both are reported as 0 when the standard deviation is 0.
+			FloatType k3 = m_std_dev * m_std_dev * m_std_dev;
+			FloatType k4 = k3 * m_std_dev;
+			m_skewness = (k3 != 0.0f) ? ((m_skewness / nd) / k3) : 0.0f;
+			m_kurtosis = (k4 != 0.0f) ? (((m_kurtosis / nd) / k4) - 3.0f) : 0.0f;
+		}
+
+		// Only compute average, variance and standard deviation.
+		// Also sets m_n and m_total; every other field is left at its clear() value
+		// (in particular m_min/m_max stay at +/-BIG_FLOAT_VAL and m_len stays 0).
+		template<typename T>
+		void calc_simplified(uint32_t n, const T* pVals, uint32_t stride = 1)
+		{
+			clear();
+
+			if (!n)
+				return;
+
+			m_n = n;
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				FloatType v = (FloatType)pVals[i * stride];
+
+				m_total += v;
+			}
+						
+			const FloatType nd = (FloatType)n;
+
+			m_avg = m_total / nd;
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				FloatType v = (FloatType)pVals[i * stride];
+				FloatType d = v - m_avg;
+
+				const FloatType d2 = d * d;
+
+				m_var += d2;
+			}
+
+			m_var /= nd;
+			m_std_dev = sqrt(m_var);
+		}
+	};
+
+	// Statistics comparing two equally sized, strided sequences of values A and B.
+	// Every calc*() method first clears all fields; the reduced variants leave the fields they do not compute at zero.
+	// Each method accepts optional precomputed stats<> for A and/or B. When a pointer is null, the statistics that
+	// the method needs are computed internally. Caller-supplied stats must contain at least the fields noted on each method.
+	template<typename FloatType>
+	struct comparative_stats
+	{
+		FloatType m_cov;					// covariance
+		FloatType m_pearson;				// Pearson Correlation Coefficient (r) [-1,1]
+		FloatType m_mse;					// mean squared error
+		FloatType m_rmse;					// root mean squared error
+		FloatType m_mae;					// mean abs error
+		FloatType m_rmsle;					// root mean squared log error
+		FloatType m_euclidean_dist;			// euclidean distance between values as vectors
+		FloatType m_cosine_sim;				// normalized dot products of values as vectors
+		FloatType m_min_diff, m_max_diff;	// minimum/maximum abs difference between values
+
+		comparative_stats()
+		{
+			clear();
+		}
+
+		// Zeroes every field.
+		void clear()
+		{
+			m_cov = 0;
+			m_pearson = 0;
+			m_mse = 0;
+			m_rmse = 0;
+			m_mae = 0;
+			m_rmsle = 0;
+			m_euclidean_dist = 0;
+			m_cosine_sim = 0;
+			m_min_diff = 0;
+			m_max_diff = 0;
+		}
+
+		// Computes every field.
+		// Uses m_avg, m_std_dev, m_min and m_len of the per-sequence stats.
+		// m_rmsle is only computed when both sequences' minimum is >= 0 (otherwise it stays 0).
+		template<typename T>
+		void calc(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats<FloatType> *pA_stats = nullptr, const stats<FloatType> *pB_stats = nullptr)
+		{
+			clear();
+			if (!n)
+				return;
+
+			stats<FloatType> temp_a_stats;
+			if (!pA_stats)
+			{
+				pA_stats = &temp_a_stats;
+				temp_a_stats.calc(n, pA, a_stride);
+			}
+
+			stats<FloatType> temp_b_stats;
+			if (!pB_stats)
+			{
+				pB_stats = &temp_b_stats;
+				temp_b_stats.calc(n, pB, b_stride);
+			}
+
+			// The log error is only defined here for non-negative data; the check doesn't depend on the loop index.
+			const bool can_calc_rmsle = (pA_stats->m_min >= 0.0f) && (pB_stats->m_min >= 0.0f);
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				const FloatType fa = (FloatType)pA[i * a_stride];
+				const FloatType fb = (FloatType)pB[i * b_stride];
+
+				if (can_calc_rmsle)
+				{
+					const FloatType ld = log(fa + 1.0f) - log(fb + 1.0f);
+					m_rmsle += ld * ld;
+				}
+
+				const FloatType diff = fa - fb;
+				const FloatType abs_diff = fabs(diff);
+
+				m_mse += diff * diff;
+				m_mae += abs_diff;
+
+				m_min_diff = i ? minimum(m_min_diff, abs_diff) : abs_diff;
+				m_max_diff = maximum(m_max_diff, abs_diff);
+
+				const FloatType da = fa - pA_stats->m_avg;
+				const FloatType db = fb - pB_stats->m_avg;
+				m_cov += da * db;
+
+				m_cosine_sim += fa * fb;
+			}
+
+			const FloatType nd = (FloatType)n;
+
+			// At this point m_mse still holds the sum of squared differences.
+			m_euclidean_dist = sqrt(m_mse);
+
+			m_mse /= nd;
+			m_rmse = sqrt(m_mse);
+
+			m_mae /= nd;
+
+			m_cov /= nd;
+
+			FloatType dv = (pA_stats->m_std_dev * pB_stats->m_std_dev);
+			if (dv != 0.0f)
+				m_pearson = m_cov / dv;
+
+			if (can_calc_rmsle)
+				m_rmsle = sqrt(m_rmsle / nd);
+
+			FloatType c = pA_stats->m_len * pB_stats->m_len;
+			if (c != 0.0f)
+				m_cosine_sim /= c;
+			else
+				m_cosine_sim = 0.0f;
+		}
+
+		// Only computes Pearson, cov, mse, rmse, Euclidean distance
+		// Uses m_avg and m_std_dev of the per-sequence stats.
+		template<typename T>
+		void calc_pearson(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats<FloatType>* pA_stats = nullptr, const stats<FloatType>* pB_stats = nullptr)
+		{
+			clear();
+			if (!n)
+				return;
+
+			// Only the mean and standard deviation are needed, so the simplified per-sequence pass is sufficient.
+			stats<FloatType> temp_a_stats;
+			if (!pA_stats)
+			{
+				pA_stats = &temp_a_stats;
+				temp_a_stats.calc_simplified(n, pA, a_stride);
+			}
+
+			stats<FloatType> temp_b_stats;
+			if (!pB_stats)
+			{
+				pB_stats = &temp_b_stats;
+				temp_b_stats.calc_simplified(n, pB, b_stride);
+			}
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				const FloatType fa = (FloatType)pA[i * a_stride];
+				const FloatType fb = (FloatType)pB[i * b_stride];
+
+				const FloatType diff = fa - fb;
+
+				m_mse += diff * diff;
+
+				const FloatType da = fa - pA_stats->m_avg;
+				const FloatType db = fb - pB_stats->m_avg;
+				m_cov += da * db;
+			}
+
+			const FloatType nd = (FloatType)n;
+
+			// At this point m_mse still holds the sum of squared differences.
+			m_euclidean_dist = sqrt(m_mse);
+
+			m_mse /= nd;
+			m_rmse = sqrt(m_mse);
+
+			m_cov /= nd;
+
+			FloatType dv = (pA_stats->m_std_dev * pB_stats->m_std_dev);
+			if (dv != 0.0f)
+				m_pearson = m_cov / dv;
+		}
+
+		// Only computes MSE, RMSE, Euclidean distance, and covariance.
+		// Uses m_avg of the per-sequence stats.
+		template<typename T>
+		void calc_simplified(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats<FloatType>* pA_stats = nullptr, const stats<FloatType>* pB_stats = nullptr)
+		{
+			clear();
+			if (!n)
+				return;
+
+			// Only the mean is needed, so the simplified per-sequence pass is sufficient.
+			stats<FloatType> temp_a_stats;
+			if (!pA_stats)
+			{
+				pA_stats = &temp_a_stats;
+				temp_a_stats.calc_simplified(n, pA, a_stride);
+			}
+
+			stats<FloatType> temp_b_stats;
+			if (!pB_stats)
+			{
+				pB_stats = &temp_b_stats;
+				temp_b_stats.calc_simplified(n, pB, b_stride);
+			}
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				const FloatType fa = (FloatType)pA[i * a_stride];
+				const FloatType fb = (FloatType)pB[i * b_stride];
+
+				const FloatType diff = fa - fb;
+
+				m_mse += diff * diff;
+
+				const FloatType da = fa - pA_stats->m_avg;
+				const FloatType db = fb - pB_stats->m_avg;
+				m_cov += da * db;
+			}
+
+			const FloatType nd = (FloatType)n;
+
+			// At this point m_mse still holds the sum of squared differences.
+			m_euclidean_dist = sqrt(m_mse);
+
+			m_mse /= nd;
+			m_rmse = sqrt(m_mse);
+
+			m_cov /= nd;
+		}
+
+		// Only computes covariance.
+		// Uses m_avg of the per-sequence stats.
+		template<typename T>
+		void calc_cov(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats<FloatType>* pA_stats = nullptr, const stats<FloatType>* pB_stats = nullptr)
+		{
+			clear();
+			if (!n)
+				return;
+
+			// Only the mean is needed, so the simplified per-sequence pass is sufficient.
+			stats<FloatType> temp_a_stats;
+			if (!pA_stats)
+			{
+				pA_stats = &temp_a_stats;
+				temp_a_stats.calc_simplified(n, pA, a_stride);
+			}
+
+			stats<FloatType> temp_b_stats;
+			if (!pB_stats)
+			{
+				pB_stats = &temp_b_stats;
+				temp_b_stats.calc_simplified(n, pB, b_stride);
+			}
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				const FloatType fa = (FloatType)pA[i * a_stride];
+				const FloatType fb = (FloatType)pB[i * b_stride];
+
+				const FloatType da = fa - pA_stats->m_avg;
+				const FloatType db = fb - pB_stats->m_avg;
+				m_cov += da * db;
+			}
+
+			const FloatType nd = (FloatType)n;
+
+			m_cov /= nd;
+		}
+	};
+		
+	// Keeps the most recently added samples (at most the count passed to the constructor/init())
+	// and computes summary statistics over them on demand.
+	class stat_history
+	{
+	public:
+		stat_history(uint32_t size)
+		{
+			init(size);
+		}
+
+		// Empties the history and sets the maximum number of retained samples to size.
+		void init(uint32_t size)
+		{
+			clear();
+
+			m_samples.reserve(size);
+			m_samples.resize(0);
+			m_max_samples = size;
+		}
+
+		// Empties the history and resets the maximum sample count to 0; call init() before reusing the object.
+		inline void clear()
+		{
+			m_samples.resize(0);
+			m_max_samples = 0;
+		}
+
+		// Appends val; once more than the maximum number of samples are stored, the element at index 0 is dropped.
+		inline void update(double val)
+		{
+			m_samples.push_back(val);
+
+			if (m_samples.size() > m_max_samples)
+				m_samples.erase_index(0);
+		}
+
+		// Number of samples currently stored.
+		inline size_t size()
+		{
+			return m_samples.size();
+		}
+
+		struct stats
+		{
+			double m_avg = 0;		// mean
+			double m_std_dev = 0;	// population standard deviation
+			double m_var = 0;		// population variance
+			double m_mad = 0;		// mean absolute deviation
+			double m_min_val = 0;
+			double m_max_val = 0;
+
+			void clear()
+			{
+				basisu::clear_obj(*this);
+			}
+		};
+
+		// Fills s with statistics over the stored samples. With no samples, s is left cleared.
+		inline void get_stats(stats& s)
+		{
+			s.clear();
+
+			if (m_samples.empty())
+				return;
+
+			const size_t num_samples = m_samples.size();
+			const double n = (double)num_samples;
+
+			// Pass 1: mean and range.
+			double total = 0;
+
+			for (size_t i = 0; i < num_samples; i++)
+			{
+				const double v = m_samples[i];
+
+				total += v;
+
+				if (!i)
+				{
+					s.m_min_val = v;
+					s.m_max_val = v;
+				}
+				else
+				{
+					s.m_min_val = basisu::minimum<double>(s.m_min_val, v);
+					s.m_max_val = basisu::maximum<double>(s.m_max_val, v);
+				}
+			}
+
+			s.m_avg = total / n;
+
+			// Pass 2: deviations about the mean.
+			// The variance is taken from the sum of squared deviations rather than from n * sum(x^2) - sum(x)^2:
+			// the latter can round to a small negative number, which would turn the standard deviation into NaN.
+			double total_abs_dev = 0, total_sq_dev = 0;
+
+			for (size_t i = 0; i < num_samples; i++)
+			{
+				const double d = m_samples[i] - s.m_avg;
+
+				total_abs_dev += fabs(d);
+				total_sq_dev += d * d;
+			}
+
+			s.m_mad = total_abs_dev / n;
+			s.m_var = total_sq_dev / n;
+			s.m_std_dev = sqrt(s.m_var);
+		}
+
+	private:
+		uint32_t m_max_samples;
+		basisu::vector<double> m_samples;
+	};
+
+	// bfloat16 helpers, see:
+	// https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
+
+	// Helper for reinterpreting the bits of a 32-bit float as an integer and vice versa.
+	typedef union
+	{
+		uint32_t u;
+		float f;
+	} float32_union;
+
+	// A bfloat16 value is stored as its raw 16 bits: 1 sign bit, 8 exponent bits, 7 mantissa bits
+	// (i.e. the upper half of a 32-bit float).
+	typedef uint16_t bfloat16;
+
+	// Expands a bfloat16 to a float by placing its bits in the upper half of the float (the lower 16 bits are zero).
+	inline float bfloat16_to_float(bfloat16 bfloat16)
+	{
+		float32_union float_union;
+		float_union.u = ((uint32_t)bfloat16) << 16;
+		return float_union.f;
+	}
+
+	// Converts a float to bfloat16.
+	// round_flag: true = round to nearest (ties to even), false = truncate the lower 16 bits.
+	// Zeros and denormals (float exponent field == 0) are returned as +0.
+	// Infinities are preserved, and a NaN always stays a NaN (it is never rounded or truncated into another value).
+	inline bfloat16 float_to_bfloat16(float input, bool round_flag = true)
+	{
+		float32_union float_union;
+		float_union.f = input;
+
+		const uint32_t exponent = (float_union.u >> 23) & 0xFF;
+
+		// Check if the number is denormalized in float32 (exponent == 0)
+		if (exponent == 0)
+		{
+			// Handle denormalized float32 as zero in bfloat16
+			return 0x0000;
+		}
+
+		// Extract the top 16 bits (sign, exponent, and 7 most significant bits of the mantissa)
+		uint32_t upperBits = float_union.u >> 16;
+
+		if (exponent == 0xFF)
+		{
+			// Inf or NaN: these must never go through rounding. A carry out of the mantissa would turn a NaN
+			// into an infinity, or wrap past the exponent into the sign bit / out of the 16-bit result.
+			// If a NaN's payload lives entirely in the discarded low bits, set a mantissa bit so it doesn't
+			// truncate to an infinity.
+			if ((float_union.u & 0x007FFFFF) && !(upperBits & 0x007F))
+				upperBits |= 0x0040;
+
+			return (bfloat16)upperBits;
+		}
+
+		if (round_flag)
+		{
+			// Check the most significant bit of the lower 16 bits for rounding
+			const uint32_t lowerBits = float_union.u & 0xFFFF;
+
+			// Round to nearest or even
+			if ((lowerBits > 0x8000) || ((lowerBits == 0x8000) && (upperBits & 1)))
+			{
+				// Round up. The input is finite here, so the largest possible carry takes the maximum finite
+				// value to exponent 0xFF with a zero mantissa, which is exactly the correctly signed infinity.
+				upperBits += 1;
+			}
+		}
+
+		return (bfloat16)upperBits;
+	}
+
+	// Returns the unbiased exponent (stored exponent field minus 127).
+	inline int bfloat16_get_exp(bfloat16 v)
+	{
+		return (int)((v >> 7) & 0xFF) - 127;
+	}
+
+	// Returns the 7 stored mantissa bits (without any implicit leading bit).
+	inline int bfloat16_get_mantissa(bfloat16 v)
+	{
+		return (v & 0x7F);
+	}
+
+	// Returns -1 if the sign bit is set, otherwise 1.
+	inline int bfloat16_get_sign(bfloat16 v)
+	{
+		return (v & 0x8000) ? -1 : 1;
+	}
+
+	// True when the exponent field is all ones (infinity or NaN).
+	inline bool bfloat16_is_nan_or_inf(bfloat16 v)
+	{
+		return ((v >> 7) & 0xFF) == 0xFF;
+	}
+
+	// True for +0 and -0.
+	inline bool bfloat16_is_zero(bfloat16 v)
+	{
+		return (v & 0x7FFF) == 0;
+	}
+
+	// Assembles a bfloat16 from its parts.
+	// sign: negative for a negative value, anything else for positive.
+	// exp: unbiased exponent in [-126, 127].
+	// mant: the 7 stored mantissa bits, in [0, 127].
+	inline bfloat16 bfloat16_init(int sign, int exp, int mant)
+	{
+		uint16_t res = (sign < 0) ? 0x8000 : 0;
+
+		// Range-check the exponent itself (not the partially built result, which already holds the sign bit).
+		assert((exp >= -126) && (exp <= 127));
+		res |= ((exp + 127) << 7);
+
+		assert((mant >= 0) && (mant < 128));
+		res |= mant;
+
+		return res;
+	}
+	
+	
+} // namespace basisu
+

+ 15 - 7
thirdparty/basis_universal/encoder/basisu_opencl.cpp

@@ -789,7 +789,7 @@ namespace basisu
 
 	struct opencl_context
 	{
-		uint32_t m_ocl_total_pixel_blocks;
+		size_t m_ocl_total_pixel_blocks;
 		cl_mem m_ocl_pixel_blocks;
 
 		cl_command_queue m_command_queue;
@@ -907,7 +907,7 @@ namespace basisu
 	};
 #pragma pack(pop)
 
-	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks)
+	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks)
 	{
 		if (!opencl_is_available())
 			return false;
@@ -938,9 +938,11 @@ namespace basisu
 		assert(pContext->m_ocl_pixel_blocks);
 		if (!pContext->m_ocl_pixel_blocks)
 			return false;
+
+		assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
 				
 		cl_encode_etc1s_param_struct ps;
-		ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks;
+		ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
 		ps.m_perceptual = perceptual;
 		ps.m_total_perms = total_perms;
 
@@ -1062,9 +1064,11 @@ exit:
 		assert(pContext->m_ocl_pixel_blocks);
 		if (!pContext->m_ocl_pixel_blocks)
 			return false;
+
+		assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
 				
 		cl_rec_param_struct ps;
-		ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks;
+		ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
 		ps.m_perceptual = perceptual;
 
 		bool status = false;
@@ -1118,8 +1122,10 @@ exit:
 		if (!pContext->m_ocl_pixel_blocks)
 			return false;
 
+		assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
+
 		fosc_param_struct ps;
-		ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks;
+		ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
 		ps.m_perceptual = perceptual;
 		
 		bool status = false;
@@ -1170,8 +1176,10 @@ exit:
 		if (!pContext->m_ocl_pixel_blocks)
 			return false;
 
+		assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
+
 		ds_param_struct ps;
-		ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks;
+		ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
 		ps.m_perceptual = perceptual;
 
 		bool status = false;
@@ -1232,7 +1240,7 @@ namespace basisu
 		BASISU_NOTE_UNUSED(context);
 	}
 
-	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks)
+	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks)
 	{
 		BASISU_NOTE_UNUSED(pContext);
 		BASISU_NOTE_UNUSED(total_blocks);

+ 1 - 1
thirdparty/basis_universal/encoder/basisu_opencl.h

@@ -43,7 +43,7 @@ namespace basisu
 	// Must match BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE
 	const uint32_t OPENCL_ENCODE_ETC1S_MAX_PERMS = 165;
 
-	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks);
+	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks);
 
 	bool opencl_encode_etc1s_blocks(opencl_context_ptr pContext, etc_block* pOutput_blocks, bool perceptual, uint32_t total_perms);
 

+ 11 - 15
thirdparty/basis_universal/encoder/basisu_resample_filters.cpp

@@ -20,8 +20,7 @@
 
 namespace basisu
 {
-#define BOX_FILTER_SUPPORT (0.5f)
-	static float box_filter(float t) /* pulse/Fourier window */
+	float box_filter(float t) /* pulse/Fourier window */
 	{
 		// make_clist() calls the filter function with t inverted (pos = left, neg = right)
 		if ((t >= -0.5f) && (t < 0.5f))
@@ -29,9 +28,8 @@ namespace basisu
 		else
 			return 0.0f;
 	}
-
-#define TENT_FILTER_SUPPORT (1.0f)
-	static float tent_filter(float t) /* box (*) box, bilinear/triangle */
+		
+	float tent_filter(float t) /* box (*) box, bilinear/triangle */
 	{
 		if (t < 0.0f)
 			t = -t;
@@ -42,8 +40,7 @@ namespace basisu
 			return 0.0f;
 	}
 
-#define BELL_SUPPORT (1.5f)
-	static float bell_filter(float t) /* box (*) box (*) box */
+	float bell_filter(float t) /* box (*) box (*) box */
 	{
 		if (t < 0.0f)
 			t = -t;
@@ -201,13 +198,12 @@ namespace basisu
 			return (0.0f);
 	}
 
-#define GAUSSIAN_SUPPORT (1.25f)
-	static float gaussian_filter(float t) // with blackman window
+	float gaussian_filter(float t) // with blackman window
 	{
 		if (t < 0)
 			t = -t;
-		if (t < GAUSSIAN_SUPPORT)
-			return clean(exp(-2.0f * t * t) * sqrt(2.0f / M_PI) * blackman_exact_window(t / GAUSSIAN_SUPPORT));
+		if (t < BASISU_GAUSSIAN_FILTER_SUPPORT)
+			return clean(exp(-2.0f * t * t) * sqrt(2.0f / M_PI) * blackman_exact_window(t / BASISU_GAUSSIAN_FILTER_SUPPORT));
 		else
 			return 0.0f;
 	}
@@ -310,9 +306,9 @@ namespace basisu
 
 	const resample_filter g_resample_filters[] =
 	{
-		{ "box", box_filter, BOX_FILTER_SUPPORT }, 
-		{ "tent", tent_filter, TENT_FILTER_SUPPORT }, 
-		{ "bell", bell_filter, BELL_SUPPORT }, 
+		{ "box", box_filter, BASISU_BOX_FILTER_SUPPORT },
+		{ "tent", tent_filter, BASISU_TENT_FILTER_SUPPORT }, 
+		{ "bell", bell_filter, BASISU_BELL_FILTER_SUPPORT }, 
 		{ "b-spline", B_spline_filter, B_SPLINE_SUPPORT },
 		{ "mitchell", mitchell_filter, MITCHELL_SUPPORT }, 
 		{ "blackman", blackman_filter, BLACKMAN_SUPPORT }, 
@@ -321,7 +317,7 @@ namespace basisu
 		{ "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, 
 		{ "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, 
 		{ "kaiser", kaiser_filter, KAISER_SUPPORT }, 
-		{ "gaussian", gaussian_filter, GAUSSIAN_SUPPORT },
+		{ "gaussian", gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT },
 		{ "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, 
 		{ "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, 
 		{ "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, 

+ 1 - 1
thirdparty/basis_universal/encoder/basisu_resampler.cpp

@@ -573,7 +573,7 @@ namespace basisu
 
 		/* Don't deallocate a contibutor list
 		* if the user passed us one of their own.
-	*/
+		*/
 
 		if ((m_Pclist_x) && (!m_clist_x_forced))
 		{

+ 2 - 0
thirdparty/basis_universal/encoder/basisu_resampler.h

@@ -113,6 +113,8 @@ namespace basisu
 			Resample_Real filter_scale,
 			Resample_Real src_ofs);
 
+		// Frees a contributor list: first the array it points to (p->p), then the list itself.
+		// Passing a null pointer is a no-op.
+		static void free_clist(Contrib_List* p)
+		{
+			if (!p)
+				return;
+
+			free(p->p);
+			free(p);
+		}
+
 	private:
 		Resampler();
 		Resampler(const Resampler &o);

+ 12 - 0
thirdparty/basis_universal/encoder/basisu_resampler_filters.h

@@ -29,6 +29,18 @@ namespace basisu
 
 	extern const resample_filter g_resample_filters[];
 	extern const int g_num_resample_filters;
+		
+	const float BASISU_BOX_FILTER_SUPPORT = 0.5f;
+	float box_filter(float t); /* pulse/Fourier window */
+
+	const float BASISU_TENT_FILTER_SUPPORT = 1.0f;
+	float tent_filter(float t); /* box (*) box, bilinear/triangle */
+
+	const float BASISU_GAUSSIAN_FILTER_SUPPORT = 1.25f;
+	float gaussian_filter(float t); // with blackman window
+
+	const float BASISU_BELL_FILTER_SUPPORT = 1.5f;
+	float bell_filter(float t); /* box (*) box (*) box */
 
 	int find_resample_filter(const char *pName);
 

+ 2 - 0
thirdparty/basis_universal/encoder/basisu_ssim.cpp

@@ -91,6 +91,8 @@ namespace basisu
 
 	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
 	{
+		assert(&dst != &orig_img);
+
 		assert(odd_filter_width && (odd_filter_width & 1));
 		odd_filter_width |= 1;
 

+ 1 - 32
thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp

@@ -3813,32 +3813,7 @@ namespace basisu
 			return hash_hsieh((const uint8_t*)&s, sizeof(s));
 		}
 	};
-
-	class tracked_stat
-	{
-	public:
-		tracked_stat() { clear(); }
-
-		void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
-
-		void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; }
-
-		tracked_stat& operator += (uint32_t val) { update(val); return *this; }
-
-		uint32_t get_number_of_values() { return m_num; }
-		uint64_t get_total() const { return m_total; }
-		uint64_t get_total2() const { return m_total2; }
-
-		float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; };
-		float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
-		float get_variance() const { float s = get_std_dev(); return s * s; }
-
-	private:
-		uint32_t m_num;
-		uint64_t m_total;
-		uint64_t m_total2;
-	};
-		
+				
 	static bool uastc_rdo_blocks(uint32_t first_index, uint32_t last_index, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, 
 		uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified, uint32_t &total_smooth)
 	{
@@ -4150,9 +4125,7 @@ namespace basisu
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(num_blocks, block_index_iter + blocks_per_job);
 
-#ifndef __EMSCRIPTEN__
 				pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, &params, flags, &total_skipped, &total_modified, &total_refined, &total_smooth, &all_succeeded, &stat_mutex] {
-#endif
 
 					uint32_t job_skipped = 0, job_modified = 0, job_refined = 0, job_smooth = 0;
 
@@ -4168,16 +4141,12 @@ namespace basisu
 						total_smooth += job_smooth;
 					}
 
-#ifndef __EMSCRIPTEN__
 					}
 				);
-#endif
 
 			} // block_index_iter
 
-#ifndef __EMSCRIPTEN__
 			pJob_pool->wait_for_all();
-#endif
 
 			status = all_succeeded;
 		}

+ 1277 - 0
thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp

@@ -0,0 +1,1277 @@
+// basisu_uastc_hdr_4x4_enc.cpp
+#include "basisu_uastc_hdr_4x4_enc.h"
+#include "../transcoder/basisu_transcoder.h"
+
+using namespace basist;
+
+namespace basisu
+{
+// First/last weight ISE (BISE) ranges per block type (mode 11 / mode 7, 1 or 2 partitions); set_quality_best() copies these into the m_first_*/m_last_*_weight_ise_range options.
+const uint32_t UHDR_MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
+const uint32_t UHDR_MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
+const uint32_t UHDR_MODE7_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS;
+const uint32_t UHDR_MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS;
+
+// Default constructor: builds the base options, then applies the 4x4 defaults via init().
+uastc_hdr_4x4_codec_options::uastc_hdr_4x4_codec_options()
+	: astc_hdr_codec_base_options()
+{
+	init();
+}
+
+// Resets the options to the UASTC HDR 4x4 defaults: base-class defaults first, then the
+// 4x4-specific overrides, and finally the default quality level.
+void uastc_hdr_4x4_codec_options::init()
+{
+	astc_hdr_codec_base_options::init();
+
+	m_bc6h_err_weight = 0.85f;
+
+	// The initial release used Q_LOG_BIAS_4x4 here, which turned out to be too low:
+	//m_q_log_bias = Q_LOG_BIAS_4x4;
+	m_q_log_bias = Q_LOG_BIAS_6x6;
+
+#if 0
+	// HACK HACK
+	m_disable_weight_plane_optimization = true;
+	m_take_first_non_clamping_mode11_submode = false;
+	m_take_first_non_clamping_mode7_submode = false;
+#endif
+
+	// The quality level must be applied at least once so the level-dependent fields are reset.
+	set_quality_level(cDefaultLevel);
+}
+
+// Configures every search option for the highest achievable quality: all block types enabled,
+// the widest weight ISE ranges, all partition patterns, and no partition estimation.
+void uastc_hdr_4x4_codec_options::set_quality_best()
+{
+	// Enable every block type.
+	m_use_solid = true;
+	m_use_mode11_part1 = true;
+	m_use_mode11_part2 = true;
+	m_use_mode7_part1 = true;
+	m_use_mode7_part2 = true;
+
+	// Mode 11, single partition.
+	m_mode11_direct_only = false;
+	m_mode11_uber_mode = true;
+	m_first_mode11_submode = -1;
+	m_last_mode11_submode = 7;
+	m_first_mode11_weight_ise_range = UHDR_MODE11_FIRST_ISE_RANGE;
+	m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE;
+
+	// Mode 7, single partition.
+	m_mode7_full_s_optimization = true;
+	m_first_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_FIRST_ISE_RANGE;
+	m_last_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE;
+
+	// Mode 7, two partitions: every pattern is allowed.
+	m_mode7_part2_part_masks = UINT32_MAX;
+	m_first_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_FIRST_ISE_RANGE;
+	m_last_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE;
+
+	// Mode 11, two partitions: every pattern is allowed.
+	m_mode11_part2_part_masks = UINT32_MAX;
+	m_first_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_FIRST_ISE_RANGE;
+	m_last_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE;
+
+	// No partition estimation shortcut, and weights are refined.
+	m_use_estimated_partitions = false;
+	m_max_estimated_partitions = 0;
+	m_refine_weights = true;
+}
+
+// "Normal" preset: all block types stay enabled, but each two-ended weight range search is
+// narrowed (mode 11 starts at range 6; the others only try their last range).
+// Fields not assigned here keep whatever value they had before the call.
+void uastc_hdr_4x4_codec_options::set_quality_normal()
+{
+	// Block types.
+	m_use_solid = true;
+	m_use_mode11_part1 = true;
+	m_use_mode11_part2 = true;
+	m_use_mode7_part1 = true;
+	m_use_mode7_part2 = true;
+
+	// Mode 11, single partition. Uber mode stays on here; whether it is actually used also
+	// depends on the user allowing it.
+	m_mode11_uber_mode = true;
+	m_first_mode11_weight_ise_range = 6;
+	m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE;
+
+	// Mode 7, single partition: only the last weight range.
+	m_first_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE;
+	m_last_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE;
+
+	// Mode 7, two partitions: all patterns, only the last weight range.
+	m_mode7_part2_part_masks = UINT32_MAX;
+	m_first_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE;
+	m_last_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE;
+
+	// Mode 11, two partitions: all patterns, only the last weight range.
+	m_mode11_part2_part_masks = UINT32_MAX;
+	m_first_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE;
+	m_last_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE;
+
+	m_refine_weights = true;
+}
+
+// "Fastest" preset: only solid blocks and single-partition mode 11 at its last weight range;
+// mode 7, two-partition blocks, uber mode and weight refinement are all switched off.
+void uastc_hdr_4x4_codec_options::set_quality_fastest()
+{
+	// What stays enabled.
+	m_use_solid = true;
+	m_use_mode11_part1 = true;
+	m_first_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE;
+	m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE;
+
+	// What gets disabled.
+	m_mode11_uber_mode = false;
+	m_mode7_full_s_optimization = false;
+	m_use_mode7_part1 = false;
+	m_use_mode7_part2 = false;
+	m_use_mode11_part2 = false;
+	m_refine_weights = false;
+}
+
+// Selects a speed/quality preset. `level` is clamped to [cMinLevel, cMaxLevel] and stored in
+// m_level. Level 0 is the fastest preset; levels 1-3 progressively widen the search; any other
+// level keeps the "best" settings untouched.
+void uastc_hdr_4x4_codec_options::set_quality_level(int level)
+{
+	level = clamp(level, cMinLevel, cMaxLevel);
+	m_level = level;
+
+	// Every preset starts from "best" and then overrides a subset of the fields.
+	set_quality_best();
+
+	if (level == 0)
+	{
+		set_quality_fastest();
+	}
+	else if (level == 1)
+	{
+		set_quality_normal();
+
+		// Mode 11: only the last two weight ranges.
+		m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE;
+		m_first_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE - 1;
+
+		// No mode 7 at all.
+		m_use_mode7_part1 = false;
+		m_use_mode7_part2 = false;
+		m_mode7_full_s_optimization = false;
+
+		// Two-partition search: a single estimated partition, first two patterns in the masks.
+		m_max_estimated_partitions = 1;
+		m_use_estimated_partitions = true;
+		m_mode7_part2_part_masks = 1 | 2;
+		m_mode11_part2_part_masks = 1 | 2;
+
+		// TODO: Disabling this hurts BC6H quality, but significantly speeds up compression.
+		//m_refine_weights = false;
+	}
+	else if (level == 2)
+	{
+		set_quality_normal();
+
+		// Two-partition search: two estimated partitions, first two patterns in the masks.
+		m_max_estimated_partitions = 2;
+		m_use_estimated_partitions = true;
+		m_mode7_part2_part_masks = 1 | 2;
+		m_mode11_part2_part_masks = 1 | 2;
+	}
+	else if (level == 3)
+	{
+		// "Best" settings, but with two estimated partitions and the first four patterns in the masks.
+		m_max_estimated_partitions = 2;
+		m_use_estimated_partitions = true;
+		m_mode7_part2_part_masks = 1 | 2 | 4 | 8;
+		m_mode11_part2_part_masks = 1 | 2 | 4 | 8;
+	}
+	// Any other level: the "best" options are already in place.
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Builds a solid-color candidate for a 4x4 block and appends it to all_results.
+//
+// If all 16 pixels have identical RGB, that color is used and the candidate's error is 0.
+// Otherwise the color is the per-channel mean computed in log2 space (with a small bias), and
+// the candidate's error is accumulated from the weighted squared q() differences per pixel.
+// A BC6H solid-color block is produced as well.
+//
+// Returns true when the input block was exactly solid.
+static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_4x4_pack_results>& all_results, const uastc_hdr_4x4_codec_options& coptions)
+{
+	const float LOG_BIAS = .125f;
+
+	const vec4F& first_px = pBlock_linear_colors[0];
+
+	// Per-channel sums of log2(value + bias), and the "all pixels equal" flag.
+	float log_sum[3] = { 0.0f, 0.0f, 0.0f };
+	bool solid_block = true;
+
+	for (uint32_t px = 0; px < 16; px++)
+	{
+		const vec4F& cur_px = pBlock_linear_colors[px];
+
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			if (first_px[c] != cur_px[c])
+				solid_block = false;
+
+			log_sum[c] += log2f(cur_px[c] + LOG_BIAS);
+		}
+	}
+
+	float rgb[3];
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		if (solid_block)
+		{
+			rgb[c] = first_px[c];
+		}
+		else
+		{
+			// Undo the log/bias transform on the mean, never going below zero.
+			const float avg = maximum<float>(0.0f, powf(2.0f, log_sum[c] * (1.0f / 16.0f)) - LOG_BIAS);
+
+			// for safety
+			rgb[c] = minimum<float>(avg, MAX_HALF_FLOAT);
+		}
+	}
+
+	const half_float rh = float_to_half_non_neg_no_nan_inf(rgb[0]);
+	const half_float gh = float_to_half_non_neg_no_nan_inf(rgb[1]);
+	const half_float bh = float_to_half_non_neg_no_nan_inf(rgb[2]);
+	const half_float ah = float_to_half_non_neg_no_nan_inf(1.0f);
+
+	astc_hdr_4x4_pack_results results;
+	results.clear();
+	results.m_is_solid = true;
+
+	// Raw 16-byte solid block: an 8-byte header followed by the R, G, B, A halfs, low byte first.
+	uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk;
+
+	packed_blk[0] = 0xFC;
+	for (uint32_t k = 1; k < 8; k++)
+		packed_blk[k] = 255;
+
+	const half_float rgba_half[4] = { rh, gh, bh, ah };
+	for (uint32_t c = 0; c < 4; c++)
+	{
+		packed_blk[8 + c * 2] = (uint8_t)rgba_half[c];
+		packed_blk[9 + c * 2] = (uint8_t)(rgba_half[c] >> 8);
+	}
+
+	results.m_best_block_error = 0;
+
+	if (!solid_block)
+	{
+		const float R_WEIGHT = coptions.m_r_err_scale;
+		const float G_WEIGHT = coptions.m_g_err_scale;
+
+		// This MUST match how errors are computed in eval_selectors().
+		// NOTE(review): the bias here is hard-coded to Q_LOG_BIAS_4x4, while init() sets
+		// m_q_log_bias to Q_LOG_BIAS_6x6 -- confirm this matches eval_selectors().
+		const auto q_r = q(rh, Q_LOG_BIAS_4x4);
+		const auto q_g = q(gh, Q_LOG_BIAS_4x4);
+		const auto q_b = q(bh, Q_LOG_BIAS_4x4);
+
+		for (uint32_t px = 0; px < 16; px++)
+		{
+			const half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[px][0]);
+			const half_float dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[px][1]);
+			const half_float db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[px][2]);
+
+			double rd = q_r - q(dr, Q_LOG_BIAS_4x4);
+			double gd = q_g - q(dg, Q_LOG_BIAS_4x4);
+			double bd = q_b - q(db, Q_LOG_BIAS_4x4);
+
+			double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+
+			results.m_best_block_error += e;
+		}
+	}
+
+	const half_float hc[3] = { rh, gh, bh };
+	bc6h_enc_block_solid_color(&results.m_bc6h_block, hc);
+
+	all_results.push_back(results);
+
+	return solid_block;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries single-partition ASTC HDR mode 11 for every weight ISE range in
+// [first_weight_ise_range, last_weight_ise_range]. Each trial whose error is below
+// BIG_FLOAT_VAL is appended to all_results together with its BC6H 1-subset transcode.
+//
+// pBlock_linear_colors is only read by the _DEBUG sanity check.
+static void pack_mode11(
+	const vec4F* pBlock_linear_colors, const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16],
+	basisu::vector<astc_hdr_4x4_pack_results>& all_results,
+	const uastc_hdr_4x4_codec_options& coptions,
+	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight_selectors)
+{
+	BASISU_NOTE_UNUSED(pBlock_linear_colors);
+	assert(first_weight_ise_range <= last_weight_ise_range);
+
+	const bool direct_only = coptions.m_mode11_direct_only;
+
+	// Encoder outputs; declared once and reused across trials.
+	uint32_t best_submode = 0;
+	uint8_t endpoint_buf[NUM_MODE11_ENDPOINTS];
+	uint8_t weight_buf[16];
+
+	clear_obj(endpoint_buf);
+	clear_obj(weight_buf);
+
+	for (uint32_t cur_weight_range = first_weight_ise_range; cur_weight_range <= last_weight_ise_range; cur_weight_range++)
+	{
+		// 16-level weights use the 192-level endpoint range; anything lower uses 256 levels.
+		const bool has_16_level_weights = (cur_weight_range == astc_helpers::BISE_16_LEVELS);
+		assert(has_16_level_weights || (cur_weight_range < astc_helpers::BISE_16_LEVELS));
+
+		uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS;
+		if (has_16_level_weights)
+			endpoint_ise_range = astc_helpers::BISE_192_LEVELS;
+
+		// Uber mode needs both option flags and at least 4 weight levels.
+		const bool use_uber_mode = coptions.m_mode11_uber_mode && (cur_weight_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode;
+
+		const double trial_error = encode_astc_hdr_block_mode_11(16, pBlock_pixels_half, pBlock_pixels_q16, cur_weight_range, best_submode, BIG_FLOAT_VAL, endpoint_buf, weight_buf, coptions, direct_only,
+			endpoint_ise_range, use_uber_mode, constrain_ise_weight_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode, false, cOrdinaryLeastSquares);
+
+		// Skip trials that did not produce an error below the sentinel.
+		if (!(trial_error < BIG_FLOAT_VAL))
+			continue;
+
+		astc_hdr_4x4_pack_results results;
+		results.clear();
+
+		results.m_best_block_error = trial_error;
+		results.m_best_submodes[0] = best_submode;
+		results.m_constrained_weights = constrain_ise_weight_selectors;
+
+		results.m_best_blk.m_num_partitions = 1;
+		results.m_best_blk.m_color_endpoint_modes[0] = 11;
+		results.m_best_blk.m_weight_ise_range = (uint8_t)cur_weight_range;
+		results.m_best_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
+
+		memcpy(results.m_best_blk.m_endpoints, endpoint_buf, NUM_MODE11_ENDPOINTS);
+		memcpy(results.m_best_blk.m_weights, weight_buf, 16);
+
+#ifdef _DEBUG
+		// Sanity check: decode the block and verify the reported error can be reproduced.
+		{
+			half_float ref_half[16][3];
+
+			for (uint32_t i = 0; i < 16; i++)
+				for (uint32_t c = 0; c < 3; c++)
+					ref_half[i][c] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][c]);
+
+			half_float decoded_rgba[4][4][4];
+			bool res = astc_helpers::decode_block(results.m_best_blk, decoded_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+			assert(res);
+
+			// Drop alpha before comparing.
+			half_float decoded_rgb[4][4][3];
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					for (uint32_t c = 0; c < 3; c++)
+						decoded_rgb[y][x][c] = decoded_rgba[y][x][c];
+
+			double cmp_err = compute_block_error(16, &ref_half[0][0], &decoded_rgb[0][0][0], coptions);
+			assert(results.m_best_block_error == cmp_err);
+		}
+#endif
+
+		// transcode to BC6H
+		assert(results.m_best_blk.m_color_endpoint_modes[0] == 11);
+
+		// Get qlog12 endpoints
+		int e[2][3];
+		bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range);
+		assert(success);
+		BASISU_NOTE_UNUSED(success);
+
+		// Transform endpoints to half float, laid out as [channel][low/high endpoint]
+		half_float h_e[3][2] =
+		{
+			{ qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
+			{ qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
+			{ qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
+		};
+
+		// Transcode to bc6h
+		success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block);
+		assert(success);
+
+		all_results.push_back(results);
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries single-partition ASTC HDR mode 7 for every weight ISE range in
+// [first_mode7_part1_weight_ise_range, last_mode7_part1_weight_ise_range], always with the
+// 256-level endpoint range. Each trial whose error is below BIG_FLOAT_VAL and whose endpoints
+// decode to qlog12 is appended to all_results together with its BC6H 1-subset transcode.
+static void pack_mode7_single_part(
+	const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16],
+	basisu::vector<astc_hdr_4x4_pack_results>& all_results, const uastc_hdr_4x4_codec_options& coptions,
+	uint32_t first_mode7_part1_weight_ise_range, uint32_t last_mode7_part1_weight_ise_range)
+{
+	assert(first_mode7_part1_weight_ise_range <= last_mode7_part1_weight_ise_range);
+
+	// Encoder outputs; declared once and reused across trials.
+	uint32_t best_submode = 0;
+	uint8_t endpoint_buf[NUM_MODE7_ENDPOINTS];
+	uint8_t weight_buf[16];
+
+	clear_obj(endpoint_buf);
+	clear_obj(weight_buf);
+
+	for (uint32_t cur_weight_range = first_mode7_part1_weight_ise_range; cur_weight_range <= last_mode7_part1_weight_ise_range; cur_weight_range++)
+	{
+		const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
+
+		const double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_pixels_half, pBlock_pixels_q16, cur_weight_range, best_submode, BIG_FLOAT_VAL, endpoint_buf, weight_buf, coptions, ise_endpoint_range);
+
+		// Skip trials that did not produce an error below the sentinel.
+		if (!(trial_error < BIG_FLOAT_VAL))
+			continue;
+
+		astc_hdr_4x4_pack_results results;
+		results.clear();
+
+		results.m_best_block_error = trial_error;
+		results.m_best_submodes[0] = best_submode;
+
+		results.m_best_blk.m_num_partitions = 1;
+		results.m_best_blk.m_color_endpoint_modes[0] = 7;
+		results.m_best_blk.m_weight_ise_range = (uint8_t)cur_weight_range;
+		results.m_best_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range;
+
+		memcpy(results.m_best_blk.m_endpoints, endpoint_buf, NUM_MODE7_ENDPOINTS);
+		memcpy(results.m_best_blk.m_weights, weight_buf, 16);
+
+		// transcode to BC6H
+		assert(results.m_best_blk.m_color_endpoint_modes[0] == 7);
+
+		// Get qlog12 endpoints; a trial whose endpoints cannot be decoded is dropped.
+		int e[2][3];
+		const bool decoded = decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range);
+		if (!decoded)
+			continue;
+
+		// Transform endpoints to half float, laid out as [channel][low/high endpoint]
+		half_float h_e[3][2] =
+		{
+			{ qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
+			{ qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
+			{ qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
+		};
+
+		// Transcode to bc6h
+		bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block);
+		assert(status);
+		(void)status;
+
+		all_results.push_back(results);
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+// Splits the block's 16 pixels into two clusters and writes to pBest_parts the num_best_parts common ASTC/BC6H 2-subset pattern indices that best match that split (best first); returns false if either cluster ends up empty.
+static bool estimate_partition(
+	const half_float pBlock_pixels_half[16][3],
+	int* pBest_parts, uint32_t num_best_parts)
+{
+	assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+
+	vec3F training_vecs[16], mean(0.0f);
+	// NOTE(review): the half_float values are cast straight to float (no half->float decode is called here) -- presumably intentional; confirm.
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		vec3F& v = training_vecs[i];
+
+		v[0] = (float)pBlock_pixels_half[i][0];
+		v[1] = (float)pBlock_pixels_half[i][1];
+		v[2] = (float)pBlock_pixels_half[i][2];
+
+		mean += v;
+	}
+	mean *= (1.0f / 16.0f);
+	// Seed the two centroids slightly below and above the mean.
+	vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) };
+
+	uint32_t cluster_pixels[2][16];
+	uint32_t num_cluster_pixels[2];
+	vec3F new_cluster_means[2];
+	// Four assign/update passes: give each pixel to its nearest centroid, then move each centroid to the mean of its pixels.
+	for (uint32_t s = 0; s < 4; s++)
+	{
+		num_cluster_pixels[0] = 0;
+		num_cluster_pixels[1] = 0;
+
+		new_cluster_means[0].clear();
+		new_cluster_means[1].clear();
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			float d0 = training_vecs[i].squared_distance(cluster_centroids[0]);
+			float d1 = training_vecs[i].squared_distance(cluster_centroids[1]);
+			// Ties (d0 == d1) go to cluster 1.
+			if (d0 < d1)
+			{
+				cluster_pixels[0][num_cluster_pixels[0]] = i;
+				new_cluster_means[0] += training_vecs[i];
+				num_cluster_pixels[0]++;
+			}
+			else
+			{
+				cluster_pixels[1][num_cluster_pixels[1]] = i;
+				new_cluster_means[1] += training_vecs[i];
+				num_cluster_pixels[1]++;
+			}
+		}
+		// An empty cluster means no usable 2-way split (also avoids dividing by zero below).
+		if (!num_cluster_pixels[0] || !num_cluster_pixels[1])
+			return false;
+
+		cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0];
+		cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1];
+	}
+	// Every pixel belongs to exactly one cluster after the last pass, so all 16 entries get written.
+	int desired_parts[4][4]; // [y][x]
+	for (uint32_t p = 0; p < 2; p++)
+	{
+		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
+		{
+			const uint32_t pix_index = cluster_pixels[p][i];
+
+			desired_parts[pix_index >> 2][pix_index & 3] = p;
+		}
+	}
+	// One sort key per pattern: (matching pixel count << 8) | pattern index.
+	uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
+
+	for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++)
+	{
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+
+		int total_sim_non_inv = 0;
+		int total_sim_inv = 0;
+		// Count agreeing pixels for the pattern as-is and with its two subset labels swapped.
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+
+				if (part == desired_parts[y][x])
+					total_sim_non_inv++;
+
+				if ((part ^ 1) == desired_parts[y][x])
+					total_sim_inv++;
+			}
+		}
+
+		int total_sim = maximum(total_sim_non_inv, total_sim_inv);
+
+		part_similarity[part_index] = (total_sim << 8) | part_index;
+
+	} // part_index;
+	// Ascending sort, so the most similar patterns end up at the back of the array.
+	std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+	// Take the top entries from the back; the low 8 bits of each key are the pattern index.
+	for (uint32_t i = 0; i < num_best_parts; i++)
+		pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF;
+
+	return true;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+// Tries two-partition ASTC HDR mode 7 encodings of a 4x4 block and appends every candidate whose summed subset error is below BIG_FLOAT_VAL to all_results, each with its BC6H 2-subset transcode. A nonzero num_estimated_partitions restricts the search to pEstimated_partitions[0..num_estimated_partitions-1].
+static void pack_mode7_2part(
+	const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16],
+	basisu::vector<astc_hdr_4x4_pack_results>& all_results, const uastc_hdr_4x4_codec_options& coptions,
+	int num_estimated_partitions, const int *pEstimated_partitions,
+	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range)
+{
+	assert(coptions.m_mode7_part2_part_masks);
+	// Template block shared by all trials: 4x4 weight grid, two partitions, both using endpoint mode 7.
+	astc_helpers::log_astc_block trial_blk;
+	clear_obj(trial_blk);
+	trial_blk.m_grid_width = 4;
+	trial_blk.m_grid_height = 4;
+
+	trial_blk.m_num_partitions = 2;
+	trial_blk.m_color_endpoint_modes[0] = 7;
+	trial_blk.m_color_endpoint_modes[1] = 7;
+	// Default: iterate over every common ASTC/BC6H 2-subset pattern (filtered by the option mask inside the loop).
+	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
+	// With estimated partitions, iterate over the caller's list instead.
+	if (num_estimated_partitions)
+	{
+		first_part_index = 0;
+		last_part_index = num_estimated_partitions;
+	}
+	
+	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
+	{
+		uint32_t part_index;
+		if (num_estimated_partitions)
+		{
+			part_index = pEstimated_partitions[part_index_iter];
+			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+		}
+		else
+		{
+			part_index = part_index_iter;
+			if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0)
+				continue;
+		}
+		// Note: m_mode7_part2_part_masks is only applied in the non-estimated path above.
+		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
+		// Per-subset copies of the pixels, plus each pixel's subset index for reassembling weights later.
+		half_float part_pixels_half[2][16][3];
+		vec4F part_pixels_q16[2][16];
+
+		uint32_t pixel_part_index[4][4]; // [y][x]
+		uint32_t num_part_pixels[2] = { 0, 0 };
+
+		// Extract each subset's texels for this partition pattern
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+				if (invert_flag)
+					part = 1 - part;
+
+				pixel_part_index[y][x] = part;
+								
+				const uint32_t n = num_part_pixels[part];
+
+				part_pixels_half[part][n][0] = pBlock_pixels_half[x + y * 4][0];
+				part_pixels_half[part][n][1] = pBlock_pixels_half[x + y * 4][1];
+				part_pixels_half[part][n][2] = pBlock_pixels_half[x + y * 4][2];
+				part_pixels_q16[part][n] = pBlock_pixels_q16[x + y * 4];
+
+				num_part_pixels[part] = n + 1;
+			}
+		}
+
+		trial_blk.m_partition_id = (uint16_t)astc_pattern;
+		// Try each requested weight ISE range for this pattern.
+		for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
+		{
+			assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS);
+			// Endpoint ISE range paired with this weight range (256 levels unless listed below) -- presumably chosen so the block fits its bit budget; confirm.
+			uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
+			if (weight_ise_range == astc_helpers::BISE_5_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_192_LEVELS;
+			else if (weight_ise_range == astc_helpers::BISE_6_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_128_LEVELS;
+			else if (weight_ise_range == astc_helpers::BISE_8_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_80_LEVELS;
+
+			uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16];
+			uint32_t trial_submode7[2];
+
+			clear_obj(trial_endpoints);
+			clear_obj(trial_weights);
+			clear_obj(trial_submode7);
+			// Encode each subset independently and sum the two errors.
+			double total_trial_err = 0;
+			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+			{
+				total_trial_err += encode_astc_hdr_block_mode_7(
+					num_part_pixels[pack_part_index], part_pixels_half[pack_part_index], part_pixels_q16[pack_part_index],
+					weight_ise_range, trial_submode7[pack_part_index], BIG_FLOAT_VAL,
+					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range);
+
+			} // pack_part_index
+
+			if (total_trial_err < BIG_FLOAT_VAL)
+			{
+				trial_blk.m_weight_ise_range = (uint8_t)weight_ise_range;
+				trial_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range;
+				// Subset 0's endpoints come first, followed by subset 1's.
+				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS);
+				// Scatter the per-subset weights back into raster (x + y * 4) order.
+				uint32_t src_pixel_index[2] = { 0, 0 };
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						uint32_t p = pixel_part_index[y][x];
+						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
+					}
+				}
+								
+				astc_hdr_4x4_pack_results results;
+				results.clear();
+
+				results.m_best_block_error = total_trial_err;
+				results.m_best_submodes[0] = trial_submode7[0];
+				results.m_best_submodes[1] = trial_submode7[1];
+				results.m_best_pat_index = part_index;
+
+				results.m_best_blk = trial_blk;
+				// Fill in the candidate's BC6H block from the ASTC block just built.
+				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
+				assert(status);
+				BASISU_NOTE_UNUSED(status);
+
+				all_results.push_back(results);
+			}
+
+		} // weight_ise_range
+
+	} // part_index
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_mode11_2part(
+	const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16],
+	basisu::vector<astc_hdr_4x4_pack_results>& all_results, const uastc_hdr_4x4_codec_options& coptions,
+	int num_estimated_partitions, const int* pEstimated_partitions)
+{
+	assert(coptions.m_mode11_part2_part_masks);
+
+	astc_helpers::log_astc_block trial_blk;
+	clear_obj(trial_blk);
+	trial_blk.m_grid_width = 4;
+	trial_blk.m_grid_height = 4;
+
+	trial_blk.m_num_partitions = 2;
+	trial_blk.m_color_endpoint_modes[0] = 11;
+	trial_blk.m_color_endpoint_modes[1] = 11;
+			
+	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
+
+	if (num_estimated_partitions)
+	{
+		first_part_index = 0;
+		last_part_index = num_estimated_partitions;
+	}
+
+	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
+	{
+		uint32_t part_index;
+		if (num_estimated_partitions)
+		{
+			part_index = pEstimated_partitions[part_index_iter];
+			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+		}
+		else
+		{
+			part_index = part_index_iter;
+			if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0)
+				continue;
+		}
+
+		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
+
+		half_float part_pixels_half[2][16][3];
+		vec4F part_pixels_q16[2][16];
+
+		uint32_t pixel_part_index[4][4]; // [y][x]
+		uint32_t num_part_pixels[2] = { 0, 0 };
+
+		// Extract each subset's texels for this partition pattern
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+				if (invert_flag)
+					part = 1 - part;
+
+				pixel_part_index[y][x] = part;
+				
+				const uint32_t n = num_part_pixels[part];
+
+				part_pixels_half[part][n][0] = pBlock_pixels_half[x + y * 4][0];
+				part_pixels_half[part][n][1] = pBlock_pixels_half[x + y * 4][1];
+				part_pixels_half[part][n][2] = pBlock_pixels_half[x + y * 4][2];
+				part_pixels_q16[part][n] = pBlock_pixels_q16[x + y * 4];
+
+				num_part_pixels[part] = n + 1;
+			}
+		}
+				
+		trial_blk.m_partition_id = (uint16_t)astc_pattern;
+						
+		for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++)
+		{
+			bool direct_only = false;
+			uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS;
+			if (weight_ise_range == astc_helpers::BISE_4_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_40_LEVELS;
+
+			uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16];
+			uint32_t trial_submode11[2];
+
+			clear_obj(trial_endpoints); 
+			clear_obj(trial_weights);
+			clear_obj(trial_submode11);
+
+			double total_trial_err = 0;
+			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+			{
+				total_trial_err += encode_astc_hdr_block_mode_11(
+					num_part_pixels[pack_part_index], part_pixels_half[pack_part_index], part_pixels_q16[pack_part_index],
+					weight_ise_range, trial_submode11[pack_part_index], BIG_FLOAT_VAL,
+					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions,
+					direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false,
+					coptions.m_first_mode11_submode, coptions.m_last_mode11_submode, false, cOrdinaryLeastSquares);
+
+			} // pack_part_index
+
+			if (total_trial_err < BIG_FLOAT_VAL)
+			{
+				trial_blk.m_weight_ise_range = (uint8_t)weight_ise_range;
+				trial_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range;
+
+				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS);
+
+				uint32_t src_pixel_index[2] = { 0, 0 };
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						uint32_t p = pixel_part_index[y][x];
+						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
+					}
+				}
+								
+				astc_hdr_4x4_pack_results results;
+				results.clear();
+
+				results.m_best_block_error = total_trial_err;
+				results.m_best_submodes[0] = trial_submode11[0];
+				results.m_best_submodes[1] = trial_submode11[1];
+				results.m_best_pat_index = part_index;
+
+				results.m_best_blk = trial_blk;
+
+				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
+				assert(status);
+				BASISU_NOTE_UNUSED(status);
+
+				all_results.push_back(results);
+			}
+
+		} // weight_ise_range
+
+	} // part_index
+}
+
+bool astc_hdr_4x4_enc_block(
+	const float* pRGBPixels, const basist::half_float *pRGBPixelsHalf,
+	const uastc_hdr_4x4_codec_options& coptions,
+	basisu::vector<astc_hdr_4x4_pack_results>& all_results)
+{
+	assert(g_astc_hdr_enc_initialized);
+	if (!g_astc_hdr_enc_initialized)
+	{
+		// astc_hdr_enc_init() MUST be called first.
+		assert(0);
+		return false;
+	}
+
+	assert(coptions.m_use_solid || coptions.m_use_mode11_part1 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2);
+
+	all_results.resize(0);
+
+	const half_float (*pBlock_pixels_half)[16][3] = reinterpret_cast<const half_float(*)[16][3]>(pRGBPixelsHalf);
+				
+	vec4F block_linear_colors[16];
+	vec4F block_pixels_q16[16];
+
+	bool is_greyscale = true;
+		
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		const float fr = pRGBPixels[i * 3 + 0], fg = pRGBPixels[i * 3 + 1], fb = pRGBPixels[i * 3 + 2];
+
+		// Sanity check the input block.
+		assert((fr >= 0) && (fr <= MAX_HALF_FLOAT) && (!std::isinf(fr)) && (!std::isnan(fr)));
+		assert((fg >= 0) && (fg <= MAX_HALF_FLOAT) && (!std::isinf(fg)) && (!std::isnan(fg)));
+		assert((fb >= 0) && (fb <= MAX_HALF_FLOAT) && (!std::isinf(fb)) && (!std::isnan(fb)));
+
+		block_linear_colors[i].set(fr, fg, fb, 1.0f);
+
+		const half_float hr = (*pBlock_pixels_half)[i][0];
+		assert(hr == basist::float_to_half(fr));
+		block_pixels_q16[i][0] = (float)half_to_qlog16(hr);
+
+		const half_float hg = (*pBlock_pixels_half)[i][1];
+		assert(hg == basist::float_to_half(fg));
+		block_pixels_q16[i][1] = (float)half_to_qlog16(hg);
+
+		const half_float hb = (*pBlock_pixels_half)[i][2];
+		assert(hb == basist::float_to_half(fb));
+		block_pixels_q16[i][2] = (float)half_to_qlog16(hb);
+		
+		block_pixels_q16[i][3] = 0.0f;
+
+		if ((hr != hg) || (hr != hb))
+			is_greyscale = false;
+	} // i
+							
+	bool is_solid = false;
+	if (coptions.m_use_solid)
+		is_solid = pack_solid(block_linear_colors, all_results, coptions);
+
+	if (!is_solid)
+	{
+		if ((is_greyscale) && (coptions.m_level == 0))
+		{
+			// Special case if it's a pure grayscale block - just try mode 7.
+			pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, 1, 1);
+			pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, UHDR_MODE7_PART1_LAST_ISE_RANGE, UHDR_MODE7_PART1_LAST_ISE_RANGE);
+		}
+		else
+		{
+			if (coptions.m_use_mode11_part1)
+			{
+				const size_t cur_num_results = all_results.size();
+
+				pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false);
+								
+				if (coptions.m_last_mode11_weight_ise_range >= astc_helpers::BISE_12_LEVELS)
+				{
+					// Try constrained weights if we're allowed to use 12/16 level ISE weight modes
+					pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, maximum<uint32_t>(coptions.m_first_mode11_weight_ise_range, astc_helpers::BISE_12_LEVELS), coptions.m_last_mode11_weight_ise_range, true);
+				}
+
+				// If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (BISE_16_LEVELS, which requires endpoint quantization), then
+				// fall back to weight ISE range 7 (BISE_12_LEVELS, which doesn't need any endpoint quantization).
+				// This is to guarantee we always get at least 1 non-solid result.
+				if (all_results.size() == cur_num_results)
+				{
+					if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
+					{
+						pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false);
+					}
+				}
+			}
+
+			if (coptions.m_use_mode7_part1)
+			{
+				// Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution.
+				pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, coptions.m_first_mode7_part1_weight_ise_range, coptions.m_last_mode7_part1_weight_ise_range);
+			}
+			else if (is_greyscale)
+			{
+				// Special case if it's a pure grayscale block and mode 7 was disabled - try it anyway, because mode 11 has worse B channel quantization.
+				pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, 1, 1);
+				pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, UHDR_MODE7_PART1_LAST_ISE_RANGE, UHDR_MODE7_PART1_LAST_ISE_RANGE);
+			}
+		}
+						
+		bool have_est = false;
+		int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
+
+		if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2))
+		{
+			if (coptions.m_use_estimated_partitions)
+				have_est = estimate_partition(*pBlock_pixels_half, best_parts, coptions.m_max_estimated_partitions);
+		}
+
+		if (coptions.m_use_mode7_part2)
+		{
+			const size_t cur_num_results = all_results.size();
+
+			pack_mode7_2part(*pBlock_pixels_half, block_pixels_q16,
+				all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, 
+				coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range);
+
+			// If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to 
+			// 4 levels (BISE_4_LEVELS), which doesn't require endpoint quantization.
+			if (all_results.size() == cur_num_results)
+			{
+				if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS)
+				{
+					pack_mode7_2part(*pBlock_pixels_half, block_pixels_q16,
+						all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, 
+						astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS);
+				}
+			}
+		}
+		
+		if (coptions.m_use_mode11_part2)
+		{
+			// This always requires endpoint quant, so it could fail to find any usable solutions.
+			pack_mode11_2part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts);
+		}
+
+		if (coptions.m_refine_weights)
+		{
+			// TODO: This is quite slow.
+			for (uint32_t i = 0; i < all_results.size(); i++)
+			{
+				bool status = astc_hdr_4x4_refine_weights(pRGBPixelsHalf, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag);
+				assert(status);
+				BASISU_NOTE_UNUSED(status);
+			}
+		}
+
+	} // !is_solid
+
+	return true;
+}
+
+bool astc_hdr_4x4_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_4x4_pack_results& results)
+{
+	assert(g_astc_hdr_enc_initialized);
+	if (!g_astc_hdr_enc_initialized)
+		return false;
+
+	if (results.m_is_solid)
+	{
+		memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk));
+	}
+	else
+	{
+		bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk);
+		if (!status)
+		{
+			assert(0);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error.
+bool astc_hdr_4x4_refine_weights(const half_float *pSource_block, 
+	astc_hdr_4x4_pack_results& cur_results, const uastc_hdr_4x4_codec_options& coptions, float bc6h_weight, bool *pImproved_flag)
+{
+	if (pImproved_flag)
+		*pImproved_flag = false;
+
+	if (cur_results.m_is_solid)
+		return true;
+
+	const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range);
+	assert((total_weights >= MIN_SUPPORTED_WEIGHT_LEVELS) && (total_weights <= MAX_SUPPORTED_WEIGHT_LEVELS));
+
+	double best_err[4][4];
+	uint8_t best_weight[4][4];
+	for (uint32_t y = 0; y < 4; y++)
+	{
+		for (uint32_t x = 0; x < 4; x++)
+		{
+			best_err[y][x] = BIG_FLOAT_VAL;
+			best_weight[y][x] = 0;
+		}
+	}
+
+	astc_hdr_4x4_pack_results temp_results;
+
+	const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f };
+
+	for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++)
+	{
+		temp_results = cur_results;
+		for (uint32_t i = 0; i < 16; i++)
+			temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index;
+		
+		half_float unpacked_astc_blk_rgba[4][4][4];
+		bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+		assert(res);
+
+		basist::bc6h_block trial_bc6h_blk;
+		res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk);
+		assert(res);
+				
+		half_float unpacked_bc6h_blk[4][4][3];
+		res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false);
+		assert(res);
+		BASISU_NOTE_UNUSED(res);
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				double total_err = 0.0f;
+
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					const half_float orig_c = pSource_block[(x + y * 4) * 3 + c];
+					const double orig_c_q = q(orig_c, Q_LOG_BIAS_4x4);
+					
+					const half_float astc_c = unpacked_astc_blk_rgba[y][x][c];
+					const double astc_c_q = q(astc_c, Q_LOG_BIAS_4x4);
+					const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c];
+					
+					const half_float bc6h_c = unpacked_bc6h_blk[y][x][c];
+					const double bc6h_c_q = q(bc6h_c, Q_LOG_BIAS_4x4);
+					const double bc6h_e = square(bc6h_c_q - orig_c_q) * c_weights[c];
+
+					const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight;
+
+					total_err += overall_err;
+
+				} //  c
+
+				if (total_err < best_err[y][x])
+				{
+					best_err[y][x] = total_err;
+					best_weight[y][x] = (uint8_t)weight_index;
+				}
+
+			} // x
+		} // y
+
+	} // weight_index
+
+	bool any_changed = false;
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3])
+		{
+			any_changed = true;
+			break;
+		}
+	}
+
+	if (any_changed)
+	{
+		memcpy(cur_results.m_best_blk.m_weights, best_weight, 16);
+
+		{
+			bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block);
+			assert(res);
+			BASISU_NOTE_UNUSED(res);
+
+			half_float unpacked_astc_blk_rgba[4][4][4];
+			res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+			assert(res);
+
+			half_float unpacked_astc_blk_rgb[4][4][3];
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					for (uint32_t c = 0; c < 3; c++)
+						unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
+
+			cur_results.m_best_block_error = compute_block_error(16, pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions);
+		}
+
+		if (pImproved_flag)
+			*pImproved_flag = true;
+	}
+
+	return true;
+}
+
+void astc_hdr_4x4_block_stats::update(const astc_hdr_4x4_pack_results& log_blk)
+{
+	std::lock_guard<std::mutex> lck(m_mutex);
+
+	m_total_blocks++;
+
+	if (log_blk.m_improved_via_refinement_flag)
+		m_total_refined++;
+
+	if (log_blk.m_is_solid)
+	{
+		m_total_solid++;
+	}
+	else
+	{
+		int best_weight_range = log_blk.m_best_blk.m_weight_ise_range;
+
+		if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7)
+		{
+			m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++;
+
+			if (log_blk.m_best_blk.m_num_partitions == 2)
+			{
+				m_total_mode7_2part++;
+
+				m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++;
+				m_total_2part++;
+
+				m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++;
+
+				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
+			}
+			else
+			{
+				m_total_mode7_1part++;
+
+				m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++;
+			}
+		}
+		else
+		{
+			m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++;
+			if (log_blk.m_constrained_weights)
+				m_total_mode11_1part_constrained_weights++;
+
+			if (log_blk.m_best_blk.m_num_partitions == 2)
+			{
+				m_total_mode11_2part++;
+
+				m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++;
+				m_total_2part++;
+
+				m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++;
+
+				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
+			}
+			else
+			{
+				m_total_mode11_1part++;
+
+				m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++;
+			}
+		}
+	}
+}
+
+void astc_hdr_4x4_block_stats::print()
+{
+	std::lock_guard<std::mutex> lck(m_mutex);
+
+	assert(m_total_blocks);
+	if (!m_total_blocks)
+		return;
+
+	printf("\nLow-level ASTC Encoder Statistics:\n");
+	printf("Total blocks: %u\n", m_total_blocks);
+	printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks);
+	printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks);
+
+	printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks);
+	printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks);
+	printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks);
+
+	printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks);
+	printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks);
+
+	printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 11:\n");
+	for (uint32_t i = 1; i <= UHDR_MODE11_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_11[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 11, 2 partition:\n");
+	for (uint32_t i = 1; i <= UHDR_MODE11_PART2_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_11_2part[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 7:\n");
+	for (uint32_t i = 1; i <= UHDR_MODE7_PART1_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_7[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 7, 2 partition:\n");
+	for (uint32_t i = 1; i <= UHDR_MODE7_PART2_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_7_2part[i]);
+	printf("\n");
+
+	printf("Mode 11 submode histogram:\n");
+	for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding
+		printf("%u %u\n", i, m_mode11_submode_hist[i]);
+	printf("\n");
+
+	printf("Mode 7 submode histogram:\n");
+	for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++)
+		printf("%u %u\n", i, m_mode7_submode_hist[i]);
+	printf("\n");
+
+	printf("Partition pattern table usage histogram:\n");
+	for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++)
+		printf("%u:%u ", i, m_part_hist[i]);
+	printf("\n\n");
+}
+
+} // namespace basisu
+

+ 16 - 59
thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h → thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.h

@@ -1,29 +1,20 @@
-// basisu_astc_hdr_enc.h
+// basisu_uastc_hdr_4x4_enc.h
 #pragma once
 #include "basisu_enc.h"
 #include "basisu_gpu_texture.h"
 #include "../transcoder/basisu_astc_helpers.h"
 #include "../transcoder/basisu_astc_hdr_core.h"
+#include "basisu_astc_hdr_common.h"
 
 namespace basisu
 {
-	// This MUST be called before encoding any blocks.
-	void astc_hdr_enc_init();
-
-	const uint32_t MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
-	const uint32_t MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
-	const uint32_t MODE7_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS;
-	const uint32_t MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS;
-	const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec)
-	const uint32_t MODE7_TOTAL_SUBMODES = 6;
-		
-	struct astc_hdr_codec_options
+	struct uastc_hdr_4x4_codec_options : astc_hdr_codec_base_options
 	{
 		float m_bc6h_err_weight;
 
 		bool m_use_solid;
 
-		bool m_use_mode11;
+		bool m_use_mode11_part1;
 		bool m_mode11_uber_mode;
 		uint32_t m_first_mode11_weight_ise_range;
 		uint32_t m_last_mode11_weight_ise_range;
@@ -45,8 +36,6 @@ namespace basisu
 		uint32_t m_first_mode11_part2_weight_ise_range;
 		uint32_t m_last_mode11_part2_weight_ise_range;
 
-		float m_r_err_scale, m_g_err_scale;
-
 		bool m_refine_weights;
 
 		uint32_t m_level;
@@ -54,13 +43,10 @@ namespace basisu
 		bool m_use_estimated_partitions;
 		uint32_t m_max_estimated_partitions;
 
-		// If true, the ASTC HDR compressor is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however.
-		bool m_allow_uber_mode;
-
-		astc_hdr_codec_options();
+		uastc_hdr_4x4_codec_options();
 
 		void init();
-				
+
 		// TODO: set_quality_level() is preferred to configure the codec for transcoding purposes.
 		static const int cMinLevel = 0;
 		static const int cMaxLevel = 4;
@@ -73,7 +59,7 @@ namespace basisu
 		void set_quality_fastest();
 	};
 
-	struct astc_hdr_pack_results
+	struct astc_hdr_4x4_pack_results
 	{
 		double m_best_block_error;
 		double m_bc6h_block_error; // note this is not used/set by the encoder, here for convienance 
@@ -119,35 +105,6 @@ namespace basisu
 		}
 	};
 			
-	void interpolate_qlog12_colors(
-		const int e[2][3],
-		basist::half_float* pDecoded_half,
-		vec3F* pDecoded_float,
-		uint32_t n, uint32_t ise_weight_range);
-		
-	bool get_astc_hdr_mode_11_block_colors(
-		const uint8_t* pEndpoints,
-		basist::half_float* pDecoded_half,
-		vec3F* pDecoded_float,
-		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
-		
-	bool get_astc_hdr_mode_7_block_colors(
-		const uint8_t* pEndpoints,
-		basist::half_float* pDecoded_half,
-		vec3F* pDecoded_float,
-		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
-
-	double eval_selectors(
-		uint32_t num_pixels,
-		uint8_t* pWeights,
-		const basist::half_float* pBlock_pixels_half,
-		uint32_t num_weight_levels,
-		const basist::half_float* pDecoded_half,
-		const astc_hdr_codec_options& coptions,
-		uint32_t usable_selector_bitmask = UINT32_MAX);
-
-	double compute_block_error(const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_options& coptions);
-
 	// Encodes a 4x4 ASTC HDR block given a 4x4 array of source block pixels/texels.
 	// Supports solid color blocks, mode 11 (all submodes), mode 7/1 partition (all submodes), 
 	// and mode 7/2 partitions (all submodes) - 30 patterns, only the ones also in common with the BC6H format.
@@ -164,16 +121,16 @@ namespace basisu
 	// astc_hdr_enc_init() MUST have been called first to initialized the codec.
 	// Input pixels are checked and cannot be NaN's, Inf's, signed, or too large (greater than MAX_HALF_FLOAT, or 65504). 
 	// Normal values and denormals are okay.
-	bool astc_hdr_enc_block(
-		const float* pRGBPixels,
-		const astc_hdr_codec_options& coptions,
-		basisu::vector<astc_hdr_pack_results> &all_results);
+	bool astc_hdr_4x4_enc_block(
+		const float* pRGBPixels, const basist::half_float *pRGBPixelsHalf,
+		const uastc_hdr_4x4_codec_options& coptions,
+		basisu::vector<astc_hdr_4x4_pack_results> &all_results);
 
-	bool astc_hdr_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_pack_results& results);
+	bool astc_hdr_4x4_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_4x4_pack_results& results);
 		
-	bool astc_hdr_refine_weights(const basist::half_float* pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool* pImproved_flag);
+	bool astc_hdr_4x4_refine_weights(const basist::half_float* pSource_block, astc_hdr_4x4_pack_results& cur_results, const uastc_hdr_4x4_codec_options& coptions, float bc6h_weight, bool* pImproved_flag);
 
-	struct astc_hdr_block_stats
+	struct astc_hdr_4x4_block_stats
 	{
 		std::mutex m_mutex;
 
@@ -195,7 +152,7 @@ namespace basisu
 
 		uint32_t m_total_refined;
 								
-		astc_hdr_block_stats() { clear(); }
+		astc_hdr_4x4_block_stats() { clear(); }
 
 		void clear()
 		{
@@ -215,7 +172,7 @@ namespace basisu
 			clear_obj(m_part_hist);
 		}
 
-		void update(const astc_hdr_pack_results& log_blk);
+		void update(const astc_hdr_4x4_pack_results& log_blk);
 		
 		void print();
 	};

+ 3 - 3
thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch

@@ -1,5 +1,5 @@
 diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp
-index f16e75bd46..81813257cd 100644
+index 59a2a50900..e9aa20f313 100644
 --- a/thirdparty/basis_universal/encoder/basisu_comp.cpp
 +++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp
 @@ -33,7 +33,7 @@
@@ -12,10 +12,10 @@ index f16e75bd46..81813257cd 100644
  
  // Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all)
 diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
-index ea994b0c4f..32018cd282 100644
+index 0f7ca1565f..d7bce42013 100644
 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
 +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
-@@ -164,7 +164,7 @@
+@@ -169,7 +169,7 @@
     // If BASISD_SUPPORT_KTX2_ZSTD is 0, UASTC files compressed with Zstd cannot be loaded.
  	#if BASISD_SUPPORT_KTX2_ZSTD
  		// We only use two Zstd API's: ZSTD_decompress() and ZSTD_isError()

+ 11 - 2
thirdparty/basis_universal/patches/0002-external-jpgd.patch

@@ -1,8 +1,8 @@
 diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-index 47e8981bc3..6c0ac0ad37 100644
+index b9804090b1..5987685ae7 100644
 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
 +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-@@ -458,7 +458,7 @@ namespace basisu
+@@ -492,7 +492,7 @@ namespace basisu
  	bool load_jpg(const char *pFilename, image& img)
  	{
  		int width = 0, height = 0, actual_comps = 0;
@@ -11,3 +11,12 @@ index 47e8981bc3..6c0ac0ad37 100644
  		if (!pImage_data)
  			return false;
  		
+@@ -512,7 +512,7 @@ namespace basisu
+ 		}
+ 
+ 		int width = 0, height = 0, actual_comps = 0;
+-		uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagLinearChromaFiltering);
++		uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagBoxChromaFiltering);
+ 		if (!pImage_data)
+ 			return false;
+ 

+ 3 - 3
thirdparty/basis_universal/patches/0003-external-tinyexr.patch

@@ -1,8 +1,8 @@
 diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-index 6c0ac0ad37..2bf486a028 100644
+index 7904aab91c..4d885cba16 100644
 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
 +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-@@ -27,7 +27,7 @@
+@@ -29,7 +29,7 @@
  #ifndef TINYEXR_USE_ZFP
  #define TINYEXR_USE_ZFP (1)
  #endif
@@ -11,7 +11,7 @@ index 6c0ac0ad37..2bf486a028 100644
  
  #ifndef MINIZ_HEADER_FILE_ONLY
  #define MINIZ_HEADER_FILE_ONLY
-@@ -3257,7 +3257,8 @@ namespace basisu
+@@ -3420,7 +3420,8 @@ namespace basisu
  		float* out_rgba = nullptr;
  		const char* err = nullptr;
  		

+ 11 - 11
thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch

@@ -1,8 +1,8 @@
 diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-index 2bf486a028..fff98e8301 100644
+index 4d885cba16..6c2cf0260e 100644
 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
 +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
-@@ -37,9 +37,6 @@
+@@ -39,9 +39,6 @@
  #endif
  #include "basisu_miniz.h"
  
@@ -12,7 +12,7 @@ index 2bf486a028..fff98e8301 100644
  #if defined(_WIN32)
  // For QueryPerformanceCounter/QueryPerformanceFrequency
  #define WIN32_LEAN_AND_MEAN
-@@ -408,16 +405,7 @@ namespace basisu
+@@ -453,16 +450,7 @@ namespace basisu
  
  	bool load_qoi(const char* pFilename, image& img)
  	{
@@ -31,7 +31,7 @@ index 2bf486a028..fff98e8301 100644
  
  	bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename)
 diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
-index 000869a533..648cfb47ae 100644
+index 339218fcf2..028ac3f314 100644
 --- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
 +++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
 @@ -19,9 +19,6 @@
@@ -41,10 +41,10 @@ index 000869a533..648cfb47ae 100644
 -#define TINYDDS_IMPLEMENTATION
 -#include "3rdparty/tinydds.h"
 -
+ #define BASISU_USE_GOOGLE_ASTC_DECODER (1)
+ 
  namespace basisu
- {
- 	//------------------------------------------------------------------------------------------------
-@@ -1980,207 +1977,7 @@ namespace basisu
+@@ -2049,207 +2046,7 @@ namespace basisu
  	// and cubemap, cubemap mipmapped, and cubemap array mipmapped.
  	bool write_dds_file(uint8_vec &dds_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
  	{
@@ -63,11 +63,11 @@ index 000869a533..648cfb47ae 100644
 -				assert(0);
 -				return false;
 -			}
--			slices = gpu_images.size() / 6;
+-			slices = gpu_images.size_u32() / 6;
 -		}
 -		else
 -		{
--			slices = gpu_images.size();
+-			slices = gpu_images.size_u32();
 -		}
 -
 -		uint32_t width = 0, height = 0, total_levels = 0;
@@ -185,7 +185,7 @@ index 000869a533..648cfb47ae 100644
 -		assert(total_levels < 32);
 -		for (uint32_t i = 0; i < total_levels; i++)
 -		{
--			mipmap_sizes[i] = mipmaps[i].size_in_bytes();
+-			mipmap_sizes[i] = mipmaps[i].size_in_bytes_u32();
 -			mipmap_ptrs[i] = mipmaps[i].get_ptr();
 -		}
 -
@@ -253,7 +253,7 @@ index 000869a533..648cfb47ae 100644
  	}
  
  	bool write_dds_file(const char* pFilename, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
-@@ -2201,188 +1998,6 @@ namespace basisu
+@@ -2270,188 +2067,6 @@ namespace basisu
  		
  	bool read_uncompressed_dds_file(const char* pFilename, basisu::vector<image> &ldr_mips,	basisu::vector<imagef>& hdr_mips)
  	{

+ 13 - 0
thirdparty/basis_universal/patches/0005-windows-illegal-character.patch

@@ -0,0 +1,13 @@
+diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h
+index 0d6d2ae936..8b82ad8c29 100644
+--- a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h
++++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h
+@@ -16,7 +16,7 @@ namespace astc_6x6_hdr
+ 	{
+ 		// Important: The Delta ITP colorspace error metric we use internally makes several assumptions about the nature of the HDR RGB inputs supplied to the encoder.
+ 		// This encoder computes colorspace error in the ICtCp (or more accurately the delta ITP, where CT is scaled by .5 vs. ICtCp to become T) colorspace, so getting this correct is important.
+-		// By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/m▓), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light).
++		// By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/m^2), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light).
+ 		// If the m_rec2020_bt2100_color_gamut flag is true, the input colorspace is treated as REC 2020/BT.2100 (which is wider than 709).
+ 		// For SDR/LDR->HDR upconversion, the REC 709 sRGB input should be converted to linear light (sRGB->linear) and the resulting normalized linear RGB values scaled by either 80 or 100 nits (the luminance of a typical SDR monitor). 
+ 		// SDR upconversion to normalized [0,1] (i.e. non-absolute) luminances may work but is not supported because ITP errors will not be predicted correctly.

+ 22 - 0
thirdparty/basis_universal/patches/0006-ambiguous-calls.patch

@@ -0,0 +1,22 @@
+diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h
+index 03fae33974..7fff4c243e 100644
+--- a/thirdparty/basis_universal/transcoder/basisu_containers.h
++++ b/thirdparty/basis_universal/transcoder/basisu_containers.h
+@@ -3349,7 +3349,7 @@ namespace basisu
+ 
+ 		inline size_t hash_key(const Key& k) const
+ 		{
+-			assert((safe_shift_left(1ULL, (SIZE_T_BITS - m_hash_shift))) == m_values.size());
++			assert((safe_shift_left(static_cast<uint64_t>(1), (SIZE_T_BITS - m_hash_shift))) == m_values.size());
+ 
+ 			// Fibonacci hashing
+ 			if (SIZE_T_BITS == 32)
+@@ -3433,7 +3433,7 @@ namespace basisu
+ 				return false;
+ 
+ 			new_map.m_hash_shift = SIZE_T_BITS - helpers::floor_log2i((uint64_t)new_hash_size);
+-			assert(new_hash_size == safe_shift_left(1ULL, SIZE_T_BITS - new_map.m_hash_shift));
++			assert(new_hash_size == safe_shift_left(static_cast<uint64_t>(1), SIZE_T_BITS - new_map.m_hash_shift));
+ 
+ 			new_map.m_grow_threshold = std::numeric_limits<size_t>::max();
+ 

+ 163 - 54
thirdparty/basis_universal/transcoder/basisu.h

@@ -20,34 +20,7 @@
 	#pragma warning (disable : 4201)
 	#pragma warning (disable : 4127) // warning C4127: conditional expression is constant
 	#pragma warning (disable : 4530) // C++ exception handler used, but unwind semantics are not enabled.
-
-	// Slamming this off always for v1.16 because we've gotten rid of most std containers.
-	#ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL
-		#define BASISU_NO_ITERATOR_DEBUG_LEVEL (1)
-	#endif
-
-	#ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL
-		//#define _HAS_ITERATOR_DEBUGGING 0
-
-		#if defined(_DEBUG) || defined(DEBUG)
-			// This is madness, but we need to disable iterator debugging in debug builds or the encoder is unsable because MSVC's iterator debugging implementation is totally broken.
-			#ifndef _ITERATOR_DEBUG_LEVEL
-			#define _ITERATOR_DEBUG_LEVEL 1
-			#endif
-			#ifndef _SECURE_SCL
-			#define _SECURE_SCL 1
-			#endif
-		#else // defined(_DEBUG) || defined(DEBUG)
-			#ifndef _SECURE_SCL
-			#define _SECURE_SCL 0
-			#endif
-			#ifndef _ITERATOR_DEBUG_LEVEL
-			#define _ITERATOR_DEBUG_LEVEL 0
-			#endif
-		#endif // defined(_DEBUG) || defined(DEBUG)
-
-	#endif // BASISU_NO_ITERATOR_DEBUG_LEVEL
-
+	
 #endif // _MSC_VER
 
 #include <stdlib.h>
@@ -66,6 +39,7 @@
 #include <type_traits>
 #include <assert.h>
 #include <random>
+#include <inttypes.h>
 
 #include "basisu_containers.h"
 
@@ -114,6 +88,7 @@ namespace basisu
 	typedef basisu::vector<int16_t> int16_vec;
 	typedef basisu::vector<uint16_t> uint16_vec;
 	typedef basisu::vector<uint32_t> uint_vec;
+	typedef basisu::vector<size_t> size_t_vec;
 	typedef basisu::vector<uint64_t> uint64_vec;
 	typedef basisu::vector<int> int_vec;
 	typedef basisu::vector<bool> bool_vec;
@@ -121,6 +96,16 @@ namespace basisu
 
 	void enable_debug_printf(bool enabled);
 	void debug_printf(const char *pFmt, ...);
+	void debug_puts(const char* p);
+
+	template <typename... Args>
+	inline void fmt_debug_printf(const char* pFmt, Args&&... args) // Formats pFmt with args via fmt_variants() and hands the resulting string to debug_puts().
+	{
+		std::string res;
+		if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward<Args>(args))... })) // every argument is wrapped in a fmt_variant before formatting
+			return; // fmt_variants() returned false, so nothing is printed
+		debug_puts(res.c_str());
+	}
 
 #ifndef __EMSCRIPTEN__
 #ifdef __GNUC__
@@ -137,16 +122,13 @@ namespace basisu
 #endif                            
 #endif
 
-	template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; }
-
-	template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; }
-	template <typename S> inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
-	template <typename S> inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
-	
-	template <typename S> inline S minimum(S a, S b) {	return (a < b) ? a : b; }
-	template <typename S> inline S minimum(S a, S b, S c) {	return minimum(minimum(a, b), c); }
-	template <typename S> inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); }
+	constexpr double cPiD = 3.14159265358979323846264338327950288;
+	constexpr float REALLY_SMALL_FLOAT_VAL = .000000125f;
+	constexpr float SMALL_FLOAT_VAL = .0000125f;
+	constexpr float BIG_FLOAT_VAL = 1e+30f;
 
+	template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; }
+		
 	inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high;	return value; }
 	inline float saturate(float value) { return clampf(value, 0, 1.0f); }
 	inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; }
@@ -159,10 +141,31 @@ namespace basisu
 	inline float maximumf(float a, float b) { return (a > b) ? a : b; }
 	inline int squarei(int i) { return i * i; }
 	inline float squaref(float i) { return i * i; }
+	inline double squared(double i) { return i * i; }
 	template<typename T> inline T square(T a) { return a * a; }
+	template<typename T> inline T sign(T a) { return (a < 0) ? (T)-1 : ((a == 0) ? (T)0 : (T)1); }
+		
+	inline bool equal_tol(float a, float b, float t) { return fabsf(a - b) <= ((maximum(fabsf(a), fabsf(b)) + 1.0f) * t); }
+	inline bool equal_tol(double a, double b, double t) { return fabs(a - b) <= ((maximum(fabs(a), fabs(b)) + 1.0f) * t); }
 
-	template <typename S> inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); }
+	template <class T>
+	inline T prev_wrap(T i, T n) // Returns i - 1, or n - 1 when i - 1 is negative.
+	{
+		T temp = i - 1;
+		if (temp < 0) // NOTE(review): never true when T is unsigned, so prev_wrap(0, n) does not wrap to n - 1 for unsigned T - confirm callers only use signed types.
+			temp = n - 1;
+		return temp;
+	}
 
+	template <class T>
+	inline T next_wrap(T i, T n) // Returns i + 1, or 0 once i + 1 reaches (or exceeds) n.
+	{
+		T temp = i + 1;
+		if (temp >= n) // stepped past the last valid value, wrap to the start
+			temp = 0;
+		return temp;
+	}
+		
 	inline uint32_t iabs(int32_t i) { return (i < 0) ? static_cast<uint32_t>(-i) : static_cast<uint32_t>(i);	}
 	inline uint64_t iabs64(int64_t i) {	return (i < 0) ? static_cast<uint64_t>(-i) : static_cast<uint64_t>(i); }
 
@@ -356,6 +359,7 @@ namespace basisu
 			return *this;
 		}
 
+#if 0
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"            
@@ -414,6 +418,57 @@ namespace basisu
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
+#else
+		inline operator uint32_t() const // Assembles the stored bytes into an integer; m_bytes[0] is the least significant byte.
+		{
+			if constexpr (NumBytes == 1)
+			{
+				return m_bytes[0];
+			}
+			else if constexpr (NumBytes == 2)
+			{
+				return (m_bytes[1] << 8U) | m_bytes[0];
+			}
+			else if constexpr (NumBytes == 3)
+			{
+				return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | m_bytes[0];
+			}
+			else if constexpr (NumBytes == 4)
+			{
+				return read_le_dword(m_bytes); // presumably a little-endian 32-bit read of m_bytes[0..3]; the helper is defined outside this hunk
+			}
+			else if constexpr (NumBytes == 5)
+			{
+				uint32_t l = read_le_dword(m_bytes);
+				uint32_t h = m_bytes[4];
+				return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); // NOTE(review): the 64-bit value is narrowed to the uint32_t return type, so the bits contributed by h are dropped - confirm this is intended
+			}
+			else if constexpr (NumBytes == 6)
+			{
+				uint32_t l = read_le_dword(m_bytes);
+				uint32_t h = (m_bytes[5] << 8U) | m_bytes[4];
+				return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); // NOTE(review): same narrowing to uint32_t as the NumBytes == 5 case
+			}
+			else if constexpr (NumBytes == 7)
+			{
+				uint32_t l = read_le_dword(m_bytes);
+				uint32_t h = (m_bytes[6] << 16U) | (m_bytes[5] << 8U) | m_bytes[4];
+				return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); // NOTE(review): same narrowing to uint32_t as the NumBytes == 5 case
+			}
+			else if constexpr (NumBytes == 8)
+			{
+				uint32_t l = read_le_dword(m_bytes);
+				uint32_t h = read_le_dword(m_bytes + 4);
+				return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U); // NOTE(review): same narrowing to uint32_t as the NumBytes == 5 case
+			}
+			else
+			{
+				static_assert(NumBytes <= 8, "Invalid NumBytes"); // this branch is only instantiated when none of the NumBytes cases above matched
+				return 0;
+			}
+		}
+		#endif
+
 	};
 
 	enum eZero { cZero };
@@ -446,18 +501,18 @@ namespace basisu
 	static const uint8_t g_huffman_sorted_codelength_codes[] = { cHuffmanSmallZeroRunCode, cHuffmanBigZeroRunCode,	cHuffmanSmallRepeatCode, cHuffmanBigRepeatCode, 0, 8, 7, 9, 6, 0xA, 5, 0xB, 4, 0xC, 3, 0xD, 2, 0xE, 1, 0xF, 0x10 };
 	const uint32_t cHuffmanTotalSortedCodelengthCodes = sizeof(g_huffman_sorted_codelength_codes) / sizeof(g_huffman_sorted_codelength_codes[0]);
 
-	// GPU texture formats
+	// GPU texture formats and various uncompressed texture formats.
 
 	enum class texture_format
 	{
 		cInvalidTextureFormat = -1,
 		
 		// Block-based formats
-		cETC1,			// ETC1
-		cETC1S,			// ETC1 (subset: diff colors only, no subblocks)
-		cETC2_RGB,		// ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1)
-		cETC2_RGBA,		// ETC2 EAC alpha block followed by ETC2 color block
-		cETC2_ALPHA,	// ETC2 EAC alpha block 
+		cETC1,				// ETC1
+		cETC1S,				// ETC1 (subset: diff colors only, no subblocks)
+		cETC2_RGB,			// ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1)
+		cETC2_RGBA,			// ETC2 EAC alpha block followed by ETC2 color block
+		cETC2_ALPHA,		// ETC2 EAC alpha block 
 		cBC1,				// DXT1
 		cBC3,				// DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block)
 		cBC4,				// DXT5A
@@ -466,7 +521,8 @@ namespace basisu
 		cBC6HUnsigned,		// HDR
 		cBC7,
 		cASTC_LDR_4x4,		// ASTC 4x4 LDR only
-		cASTC_HDR_4x4,		// ASTC 4x4 HDR only (but may use LDR ASTC blocks internally)
+		cASTC_HDR_4x4,		// ASTC 4x4 HDR only (but may use LDR ASTC blocks internally, although our encoders don't do this)
+		cASTC_HDR_6x6,		// ASTC 6x6 HDR only (but may use LDR ASTC blocks internally, although our encoders don't do this)
 		cPVRTC1_4_RGB,
 		cPVRTC1_4_RGBA,
 		cATC_RGB,
@@ -491,8 +547,33 @@ namespace basisu
 		cRGB_9E5
 	};
 
+	inline bool is_uncompressed_texture_format(texture_format fmt) // Returns true only for the uncompressed formats listed in the switch below.
+	{
+		switch (fmt)
+		{
+		case texture_format::cRGBA32:
+		case texture_format::cRGB565:
+		case texture_format::cBGR565:
+		case texture_format::cRGBA4444:
+		case texture_format::cABGR4444:
+		case texture_format::cRGBA_HALF:
+		case texture_format::cRGB_HALF:
+		case texture_format::cRGB_9E5:
+			return true;
+		default:
+			break; // every value not listed above, including cInvalidTextureFormat, falls through to return false
+		}
+
+		return false;
+	}
+
+	inline bool is_block_based_texture_format(texture_format fmt) // Defined purely as the negation of is_uncompressed_texture_format().
+	{
+		return !is_uncompressed_texture_format(fmt); // NOTE(review): consequently also true for cInvalidTextureFormat, so asserts built on this helper do not reject invalid formats
+	}
+
 	// This is bytes per block for GPU formats, or bytes per texel for uncompressed formats.
-	inline uint32_t get_bytes_per_block(texture_format fmt)
+	inline uint32_t get_bytes_per_block_or_pixel(texture_format fmt)
 	{
 		switch (fmt)
 		{
@@ -534,16 +615,22 @@ namespace basisu
 	// This is qwords per block for GPU formats, or not valid for uncompressed formats.
 	inline uint32_t get_qwords_per_block(texture_format fmt)
 	{
-		return get_bytes_per_block(fmt) >> 3;
+		assert(is_block_based_texture_format(fmt));
+
+		const uint32_t bytes_per_block = get_bytes_per_block_or_pixel(fmt);
+		return bytes_per_block >> 3;
 	}
 
 	inline uint32_t get_block_width(texture_format fmt)
 	{
-		BASISU_NOTE_UNUSED(fmt);
+		assert(is_block_based_texture_format(fmt));
+
 		switch (fmt)
 		{
 		case texture_format::cFXT1_RGB:
 			return 8;
+		case texture_format::cASTC_HDR_6x6:
+			return 6;
 		default:
 			break;
 		}
@@ -552,20 +639,42 @@ namespace basisu
 
 	inline uint32_t get_block_height(texture_format fmt)
 	{
-		BASISU_NOTE_UNUSED(fmt);
+		assert(is_block_based_texture_format(fmt));
+
+		switch (fmt)
+		{
+		case texture_format::cASTC_HDR_6x6:
+			return 6;
+		default:
+			break;
+		}
 		return 4;
 	}
 
 	inline bool is_hdr_texture_format(texture_format fmt)
 	{
-		if (fmt == texture_format::cASTC_HDR_4x4)
-			return true;
-		if (fmt == texture_format::cUASTC_HDR_4x4)
-			return true;
-		if ((fmt == texture_format::cBC6HSigned) || (fmt == texture_format::cBC6HUnsigned))
+		switch (fmt)
+		{
+		case texture_format::cASTC_HDR_4x4:
+		case texture_format::cUASTC_HDR_4x4:
+		case texture_format::cASTC_HDR_6x6:
+		case texture_format::cBC6HSigned:
+		case texture_format::cBC6HUnsigned:
+		case texture_format::cRGBA_HALF:
+		case texture_format::cRGB_HALF:
+		case texture_format::cRGB_9E5:
 			return true;
+		default:
+			break;
+		}
+
 		return false;
 	}
+
+	inline bool is_ldr_texture_format(texture_format fmt) // Defined purely as the negation of is_hdr_texture_format().
+	{
+		return !is_hdr_texture_format(fmt); // NOTE(review): any value is_hdr_texture_format() does not list, including cInvalidTextureFormat, is reported as LDR
+	}
 							
 } // namespace basisu
 

+ 130 - 26
thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h

@@ -35,40 +35,17 @@ namespace basist
 	const uint32_t MAX_QLOG16 = 63487;
 	const float MAX_QLOG16_VAL = 65504.0f;
 
+	// TODO: Should be called something like "NUM_MODE11_ENDPOINT_VALUES"
 	const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4;
 
-	// Notes:
-	// qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless)
-	// However, this is not lossless in the general sense.
-	inline half_float qlog16_to_half_slow(uint32_t qlog16)
-	{
-		assert(qlog16 <= 0xFFFF);
-
-		int C = qlog16;
-
-		int E = (C & 0xF800) >> 11;
-		int M = C & 0x7FF;
-
-		int Mt;
-		if (M < 512)
-			Mt = 3 * M;
-		else if (M >= 1536)
-			Mt = 5 * M - 2048;
-		else
-			Mt = 4 * M - 512;
-
-		int Cf = (E << 10) + (Mt >> 3);
-		return (half_float)Cf;
-	}
-
 	// This is not lossless
-	inline half_float qlog_to_half_slow(uint32_t qlog, uint32_t bits)
+	inline half_float qlog_to_half(uint32_t qlog, uint32_t bits)
 	{
 		assert((bits >= 7U) && (bits <= 16U));
 		assert(qlog < (1U << bits));
 
 		int C = qlog << (16 - bits);
-		return qlog16_to_half_slow(C);
+		return astc_helpers::qlog16_to_half(C);
 	}
 
 	void astc_hdr_core_init();
@@ -99,4 +76,131 @@ namespace basist
 	bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk);
 	bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk);
 
+	namespace astc_6x6_hdr
+	{
+		const uint32_t MAX_ASTC_HDR_6X6_DIM = 32768;
+		const int32_t REUSE_MAX_BUFFER_ROWS = 5; // 1+-(-4), so we need to buffer 5 rows total
+
+		struct block_mode_desc
+		{
+			bool m_dp;
+			uint32_t m_cem;
+			uint32_t m_num_partitions;
+			uint32_t m_grid_x;
+			uint32_t m_grid_y;
+
+			// the coding ISE ranges (which may not be valid ASTC ranges for this configuration)
+			uint32_t m_endpoint_ise_range;
+			uint32_t m_weight_ise_range;
+
+			// the physical/output ASTC decompression ISE ranges (i.e. what the decompressor must output)
+			uint32_t m_transcode_endpoint_ise_range;
+			uint32_t m_transcode_weight_ise_range;
+
+			uint32_t m_flags;
+			int m_dp_channel;
+		};
+
+		// Lack of level flag indicates level 3+
+		const uint32_t BASIST_HDR_6X6_LEVEL0 = 1;
+		const uint32_t BASIST_HDR_6X6_LEVEL1 = 2;
+		const uint32_t BASIST_HDR_6X6_LEVEL2 = 4;
+
+		const uint32_t TOTAL_BLOCK_MODE_DECS = 75;
+		extern const block_mode_desc g_block_mode_descs[TOTAL_BLOCK_MODE_DECS];
+
+		void copy_weight_grid(bool dual_plane, uint32_t grid_x, uint32_t grid_y, const uint8_t* transcode_weights, astc_helpers::log_astc_block& decomp_blk);
+
+		enum class encoding_type
+		{
+			cInvalid = -1,
+			cRun = 0,
+			cSolid = 1,
+			cReuse = 2,
+			cBlock = 3,
+			cTotal
+		};
+
+		const uint32_t REUSE_XY_DELTA_BITS = 5;
+		const uint32_t NUM_REUSE_XY_DELTAS = 1 << REUSE_XY_DELTA_BITS;
+
+		struct reuse_xy_delta
+		{
+			int8_t m_x, m_y;
+		};
+
+		extern const reuse_xy_delta g_reuse_xy_deltas[NUM_REUSE_XY_DELTAS];
+
+		const uint32_t RUN_CODE = 0b000, RUN_CODE_LEN = 3;
+		const uint32_t SOLID_CODE = 0b100, SOLID_CODE_LEN = 3;
+		const uint32_t REUSE_CODE = 0b10, REUSE_CODE_LEN = 2;
+		const uint32_t BLOCK_CODE = 0b1, BLOCK_CODE_LEN = 1;
+
+		enum class endpoint_mode
+		{
+			cInvalid = -1,
+
+			cRaw = 0,
+			cUseLeft,
+			cUseUpper,
+			cUseLeftDelta,
+			cUseUpperDelta,
+
+			cTotal
+		};
+
+		enum class block_mode
+		{
+			cInvalid = -1,
+
+			cBMTotalModes = TOTAL_BLOCK_MODE_DECS
+		};
+
+		const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
+
+		const uint32_t NUM_UNIQUE_PARTITIONS2 = 521;
+		extern const uint32_t g_part2_unique_index_to_seed[NUM_UNIQUE_PARTITIONS2];
+
+		const uint32_t NUM_UNIQUE_PARTITIONS3 = 333;
+		extern const uint32_t g_part3_unique_index_to_seed[NUM_UNIQUE_PARTITIONS3];
+
+		bool decode_values(basist::bitwise_decoder& decoder, uint32_t total_values, uint32_t ise_range, uint8_t* pValues);
+
+		void requantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_ise_vals, uint32_t to_ise_range);
+
+		void requantize_ise_endpoints(uint32_t cem, uint32_t src_ise_endpoint_range, const uint8_t* pSrc_endpoints, uint32_t dst_ise_endpoint_range, uint8_t* pDst_endpoints);
+
+		const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_2 = 2;
+		const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_4 = 4;
+		const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_9 = 9;
+
+		struct fast_bc6h_params // Options taken by fast_encode_bc6h(); the constructor fills in the defaults through init().
+		{
+			uint32_t m_num_diff_endpoint_modes_to_try; // presumably one of the BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_* constants (2, 4 or 9) - confirm against the encoder
+			uint32_t m_max_2subset_pats_to_try;
+
+			bool m_hq_ls;
+			bool m_brute_force_weight4_assignment;
+			
+			fast_bc6h_params()
+			{
+				init();
+			}
+
+			void init() // Resets all four members to their default values.
+			{
+				m_hq_ls = true;
+				m_num_diff_endpoint_modes_to_try = BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_2; // the constant is 2
+				m_max_2subset_pats_to_try = 1;
+				m_brute_force_weight4_assignment = false;
+			}
+		};
+
+		void fast_encode_bc6h(const basist::half_float* pPixels, basist::bc6h_block* pBlock, const fast_bc6h_params &params);
+
+		bool decode_6x6_hdr(const uint8_t* pComp_data, uint32_t comp_data_size, basisu::vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t& width, uint32_t& height);
+
+	} // namespace astc_6x6_hdr
+
 } // namespace basist
+

+ 208 - 80
thirdparty/basis_universal/transcoder/basisu_astc_helpers.h

@@ -15,6 +15,7 @@ namespace astc_helpers
 	const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid
 	const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels
 	const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values
+	const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4;
 
 	static const uint32_t NUM_ASTC_BLOCK_SIZES = 14;
 	extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2];
@@ -108,25 +109,30 @@ namespace astc_helpers
 		bool m_error_flag;
 		
 		bool m_solid_color_flag_ldr, m_solid_color_flag_hdr;
-		uint16_t m_solid_color[4];
 
+		uint8_t m_user_mode;					// user defined value, not used in this module
+		
 		// Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr
-		uint32_t m_grid_width, m_grid_height;	// weight grid dimensions, not the dimension of the block
+		uint8_t m_grid_width, m_grid_height;	// weight grid dimensions, not the dimension of the block
 		
 		bool m_dual_plane;
 
-		uint32_t m_weight_ise_range;			// 0-11
-		uint32_t m_endpoint_ise_range;			// 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking
+		uint8_t m_weight_ise_range;				// 0-11
+		uint8_t m_endpoint_ise_range;			// 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking
 
-		uint32_t m_color_component_selector;	// 0-3, 0=GBA R, 1=RBA G, 2=RGA B, 3=RGB A, only used in dual plane mode
+		uint8_t m_color_component_selector;	// 0-3, controls which channel uses the 2nd (odd) weights, only used in dual plane mode
 
-		uint32_t m_num_partitions;				// or the # of subsets, 1-4 (1-3 if dual plane mode)
-		uint32_t m_partition_id;				// 10-bits, must be 0 if m_num_partitions==1
+		uint8_t m_num_partitions;				// or the # of subsets, 1-4 (1-3 if dual plane mode)
+		uint16_t m_partition_id;				// 10-bits, must be 0 if m_num_partitions==1
 		
-		uint32_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's
+		uint8_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's
 		
-		// ISE weight grid values. In dual plane mode, the order is p0,p1,  p0,p1,  etc.
-		uint8_t m_weights[MAX_GRID_WEIGHTS];
+		union
+		{
+			// ISE weight grid values. In dual plane mode, the order is p0,p1,  p0,p1,  etc.
+			uint8_t m_weights[MAX_GRID_WEIGHTS];
+			uint16_t m_solid_color[4];
+		};
 		
 		// ISE endpoint values
 		// Endpoint order examples:
@@ -137,7 +143,7 @@ namespace astc_helpers
 		// 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1
 		// 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1
 		uint8_t m_endpoints[MAX_ENDPOINTS];
-
+				
 		void clear()
 		{
 			memset(this, 0, sizeof(*this));
@@ -169,7 +175,7 @@ namespace astc_helpers
 
 	inline int get_ise_sequence_bits(int count, int range)
 	{
-		// See 18.22 Data Size Determination
+		// See 18.22 Data Size Determination - note this will be <= the # of bits actually written by encode_bise(). (It's magic.)
 		int total_bits = g_ise_range_table[range][0] * count;
 		total_bits += (g_ise_range_table[range][1] * 8 * count + 4) / 5;
 		total_bits += (g_ise_range_table[range][2] * 7 * count + 2) / 3;
@@ -182,16 +188,26 @@ namespace astc_helpers
 		return (l * (64 - w) + h * w + 32) >> 6;
 	}
 
-	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range);
+	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats = nullptr);
+
+	struct pack_stats // Bit counters that pack_astc_block() adds to when it is given a non-null pack_stats pointer.
+	{
+		uint32_t m_header_bits;
+		uint32_t m_endpoint_bits;
+		uint32_t m_weight_bits;
+
+		inline pack_stats() { clear(); } // a new instance starts with every counter at zero
+		inline void clear() { memset(this, 0, sizeof(*this)); } // zeroes all three counters in one call
+	};
 
 	// Packs a logical to physical ASTC block. Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions.
-	bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr);
+	bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr, pack_stats *pStats = nullptr);
 
 	// Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component.
-	void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a);
+	void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a, pack_stats *pStats = nullptr);
 
 	// Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's)
-	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah);
+	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats = nullptr);
 
 	// These helpers are all quite slow, but are useful for table preparation.
 	
@@ -299,6 +315,24 @@ namespace astc_helpers
 
 	extern dequant_tables g_dequant_tables;
 	void init_tables(bool init_rank_tabs);
+
+	struct weighted_sample
+	{
+		uint8_t m_src_x;
+		uint8_t m_src_y;
+		uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8
+	};
+
+	void compute_upsample_weights(
+		int block_width, int block_height,
+		int weight_grid_width, int weight_grid_height,
+		weighted_sample* pWeights); // there will be block_width * block_height bilinear samples
+
+	void upsample_weight_grid(
+		uint32_t bx, uint32_t by,		// destination/to dimension
+		uint32_t wx, uint32_t wy,		// source/from dimension
+		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
+		uint8_t* pDst_weights);			// [by][bx]
 		
 	// Procedurally returns the texel partition/subset index given the block coordinate and config.
 	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block);
@@ -315,6 +349,27 @@ namespace astc_helpers
 	half_float float_to_half(float val, bool toward_zero);
 	float half_to_float(half_float hval);
 
+	// Notes:
+	// qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless)
+	// However, this is not lossless in the general sense.
+	inline half_float qlog16_to_half(int k) // Maps a 16-bit qlog value k to a half_float bit pattern.
+	{
+		assert((k >= 0) && (k <= 0xFFFF));
+
+		int E = (k & 0xF800) >> 11; // upper 5 bits of k
+		int M = k & 0x7FF; // lower 11 bits of k
+
+		int Mt; // M remapped by one of three linear segments
+		if (M < 512)
+			Mt = 3 * M;
+		else if (M >= 1536)
+			Mt = 5 * M - 2048;
+		else
+			Mt = 4 * M - 512; // remaining case: M in [512, 1535]
+
+		return (half_float)((E << 10) + (Mt >> 3)); // Mt >> 3 is at most 1023, so it fits in the 10 bits below E
+	}
+
 	const int MAX_RGB9E5 = 0xff80;
 	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b);
 	uint32_t pack_rgb9e5(float r, float g, float b);
@@ -437,7 +492,7 @@ namespace astc_helpers
 	};
 
 	// Encodes 3 values to output, usable for any range that uses quints and bits
-	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n)
+	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t* pStats)
 	{
 		// First extract the quints and the bits from the 3 input values
 		int quints = 0, bits[3];
@@ -461,6 +516,9 @@ namespace astc_helpers
 		// Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96.
 		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) |
 			(bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3);
+
+		if (pStats)
+			*pStats += n * 3 + 7;
 	}
 
 	static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39,
@@ -471,7 +529,7 @@ namespace astc_helpers
 		191, 223, 124, 125, 126 };
 
 	// Encodes 5 values to output, usable for any range that uses trits and bits
-	static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n)
+	static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t *pStats)
 	{
 		// First extract the trits and the bits from the 5 input values
 		int trits = 0, bits[5];
@@ -494,13 +552,16 @@ namespace astc_helpers
 
 		// Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94.
 		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2);
-
+		
 		astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) |
 			(bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6);
+		
+		if (pStats)
+			*pStats += n * 5 + 8;
 	}
 
 	// Packs values using ASTC's BISE to output buffer.
-	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range)
+	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats)
 	{
 		uint32_t temp[5] = { 0 };
 
@@ -533,19 +594,23 @@ namespace astc_helpers
 				for (int i = 0; i < limit; i++)
 					vals[i] = pSrc_vals[group_index * group_size + i];
 
+				// Note this always writes a group of 3 or 5 bits values, even for incomplete groups. So it can write more than needed. 
+				// get_ise_sequence_bits() returns the # of bits that must be written for proper decoding.
 				if (group_size == 5)
-					astc_encode_trits(temp, vals, bit_pos, num_bits);
+					astc_encode_trits(temp, vals, bit_pos, num_bits, pStats);
 				else
-					astc_encode_quints(temp, vals, bit_pos, num_bits);
+					astc_encode_quints(temp, vals, bit_pos, num_bits, pStats);
 			}
 		}
 		else
 		{
 			for (int i = 0; i < num_vals; i++)
 				astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits);
+
+			if (pStats)
+				*pStats += num_vals * num_bits;
 		}
 
-		// TODO: Could this write too many bits on incomplete blocks?
 		pDst[0] |= temp[0]; pDst[1] |= temp[1];
 		pDst[2] |= temp[2]; pDst[3] |= temp[3];
 	}
@@ -652,7 +717,7 @@ namespace astc_helpers
 		return false;
 	}
 
-	bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range)
+	bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range, pack_stats *pStats)
 	{
 		memset(&phys_block, 0, sizeof(phys_block));
 
@@ -665,12 +730,12 @@ namespace astc_helpers
 				
 		if (log_block.m_solid_color_flag_ldr)
 		{
-			pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]);
+			pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
 			return true;
 		}
 		else if (log_block.m_solid_color_flag_hdr)
 		{
-			pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]);
+			pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
 			return true;
 		}
 				
@@ -687,6 +752,8 @@ namespace astc_helpers
 
 		if (log_block.m_color_component_selector > 3)
 			return false;
+
+		// TODO: sanity check grid width/height vs. block's physical width/height
 				
 		uint32_t config_bits = 0;
 		if (!get_config_bits(log_block, config_bits))
@@ -694,6 +761,8 @@ namespace astc_helpers
 
 		uint32_t bit_pos = 0;
 		astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11);
+		if (pStats)
+			pStats->m_header_bits += 11;
 
 		const uint32_t total_grid_weights = (log_block.m_dual_plane ? 2 : 1) * (log_block.m_grid_width * log_block.m_grid_height);
 		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
@@ -705,6 +774,8 @@ namespace astc_helpers
 		uint32_t total_extra_bits = 0;
 
 		astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2);
+		if (pStats)
+			pStats->m_header_bits += 2;
 
 		if (log_block.m_num_partitions > 1)
 		{
@@ -712,12 +783,14 @@ namespace astc_helpers
 				return false;
 
 			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10);
+			if (pStats)
+				pStats->m_header_bits += 10;
 
 			uint32_t highest_cem = 0, lowest_cem = UINT32_MAX;
 			for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
 			{
-				highest_cem = my_max(highest_cem, log_block.m_color_endpoint_modes[j]);
-				lowest_cem = my_min(lowest_cem, log_block.m_color_endpoint_modes[j]);
+				highest_cem = my_max<uint32_t>(highest_cem, log_block.m_color_endpoint_modes[j]);
+				lowest_cem = my_min<uint32_t>(lowest_cem, log_block.m_color_endpoint_modes[j]);
 			}
 
 			if (highest_cem > 15)
@@ -752,9 +825,13 @@ namespace astc_helpers
 
 				uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits;
 				astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits);
+				if (pStats)
+					pStats->m_header_bits += total_extra_bits;
 			}
 
 			astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem & 0x3f, 6);
+			if (pStats)
+				pStats->m_header_bits += 6;
 		}
 		else
 		{
@@ -764,6 +841,8 @@ namespace astc_helpers
 				return false;
 
 			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4);
+			if (pStats)
+				pStats->m_header_bits += 4;
 		}
 
 		if (log_block.m_dual_plane)
@@ -775,6 +854,8 @@ namespace astc_helpers
 			
 			uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits;
 			astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2);
+			if (pStats)
+				pStats->m_header_bits += 2;
 		}
 
 		const uint32_t total_config_bits = bit_pos + total_extra_bits;
@@ -812,6 +893,12 @@ namespace astc_helpers
 			return false;
 		}
 
+		if (pStats)
+		{
+			pStats->m_endpoint_bits += get_ise_sequence_bits(total_cem_vals, endpoint_ise_range);
+			pStats->m_weight_bits += get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
+		}
+
 		// Pack endpoints forwards
 		encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range);
 		
@@ -1210,7 +1297,7 @@ namespace astc_helpers
 		}
 	}
 
-	void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah)
+	void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats)
 	{
 		uint8_t* pDst = (uint8_t*)&blk.m_vals[0];
 		memset(pDst, 0xFF, 16);
@@ -1226,10 +1313,13 @@ namespace astc_helpers
 		pDst[13] = (uint8_t)(bh >> 8);
 		pDst[14] = (uint8_t)ah;
 		pDst[15] = (uint8_t)(ah >> 8);
+
+		if (pStats)
+			pStats->m_header_bits += 128;
 	}
 
 	// rh-ah are half-floats
-	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) 
+	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats *pStats) 
 	{
 		uint8_t* pDst = (uint8_t*)&blk.m_vals[0];
 		memset(pDst, 0xFF, 16);
@@ -1244,6 +1334,9 @@ namespace astc_helpers
 		pDst[13] = (uint8_t)(bh >> 8);
 		pDst[14] = (uint8_t)ah;
 		pDst[15] = (uint8_t)(ah >> 8);
+
+		if (pStats)
+			pStats->m_header_bits += 128;
 	}
 		
 	bool is_cem_ldr(uint32_t mode)
@@ -1323,22 +1416,17 @@ namespace astc_helpers
 	dequant_tables g_dequant_tables;
 
 	void precompute_texel_partitions_4x4();
+	void precompute_texel_partitions_6x6();
 
 	void init_tables(bool init_rank_tabs) // Initializes g_dequant_tables and precomputes the 4x4 and 6x6 texel partition tables.
 	{
 		g_dequant_tables.init(init_rank_tabs);
 		
 		precompute_texel_partitions_4x4();
+		precompute_texel_partitions_6x6(); // fills g_texel_partitions_6x6, which get_precompute_texel_partitions_6x6() reads
 	}
-
-	struct weighted_sample
-	{
-		uint8_t m_src_x;
-		uint8_t m_src_y;
-		uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8
-	};
-
-	static void compute_upsample_weights(
+		
+	void compute_upsample_weights(
 		int block_width, int block_height,
 		int weight_grid_width, int weight_grid_height,
 		weighted_sample* pWeights) // there will be block_width * block_height bilinear samples
@@ -1373,7 +1461,7 @@ namespace astc_helpers
 	}
 
 	// Should be dequantized [0,64] weights
-	static void upsample_weight_grid(
+	void upsample_weight_grid(
 		uint32_t bx, uint32_t by,		// destination/to dimension
 		uint32_t wx, uint32_t wy,		// source/from dimension
 		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
@@ -1429,6 +1517,7 @@ namespace astc_helpers
 		return p;
 	}
 
+	// small_block = num_blk_pixels < 31
 	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block)
 	{
 		assert(zIn == 0);
@@ -1495,7 +1584,11 @@ namespace astc_helpers
 			: 3;
 	}
 
-	static uint32_t g_texel_partitions_4x4[1024][2];
+	// 4x4, 2 and 3 subsets
+	static uint32_t g_texel_partitions_4x4[1024][2]; 
+	
+	// 6x6, 2 and 3 subsets (2 subsets low 4 bits, 3 subsets high 4 bits)
+	static uint8_t g_texel_partitions_6x6[1024][6 * 6];
 
 	void precompute_texel_partitions_4x4()
 	{
@@ -1518,6 +1611,24 @@ namespace astc_helpers
 		}
 	}
 
+	// Fills g_texel_partitions_6x6 for all 1024 partition seeds. Each texel's entry packs its
+	// 2-subset partition index into the low 4 bits and its 3-subset partition index into the high 4 bits.
+	void precompute_texel_partitions_6x6()
+	{
+		const uint32_t cBlockDim = 6;
+
+		for (uint32_t seed = 0; seed < 1024; seed++)
+		{
+			uint8_t* pDst = &g_texel_partitions_6x6[seed][0];
+
+			// Texels are visited in raster order, so texel == x + y * 6.
+			for (uint32_t texel = 0; texel < cBlockDim * cBlockDim; texel++)
+			{
+				const uint32_t tx = texel % cBlockDim;
+				const uint32_t ty = texel / cBlockDim;
+
+				// A 6x6 block has 36 texels, so the small-block flag is always false here.
+				const uint32_t two_subsets = compute_texel_partition(seed, tx, ty, 0, 2, false);
+				const uint32_t three_subsets = compute_texel_partition(seed, tx, ty, 0, 3, false);
+
+				assert((two_subsets <= 1) && (three_subsets <= 2));
+				pDst[texel] = (uint8_t)((three_subsets << 4) | two_subsets);
+			}
+		}
+	}
+
 	static inline int get_precompute_texel_partitions_4x4(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions)
 	{
 		assert(g_texel_partitions_4x4[1][0]);
@@ -1529,6 +1640,17 @@ namespace astc_helpers
 		return (g_texel_partitions_4x4[seed][num_partitions - 2] >> shift) & 3;
 	}
 
+	// Returns the precomputed subset index of texel (x, y) of a 6x6 block for the given partition seed.
+	// num_partitions must be 2 or 3: the 2-subset index is stored in the low nibble of the table entry,
+	// the 3-subset index in the high nibble.
+	static inline int get_precompute_texel_partitions_6x6(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions)
+	{
+		// NOTE(review): presumably a "tables were initialized" check like the 4x4 variant - confirm that
+		// entry [0][0] is actually nonzero once precompute_texel_partitions_6x6() has run.
+		assert(g_texel_partitions_6x6[0][0]);
+		assert(seed < 1024);
+		assert((x <= 5) && (y <= 5));
+		assert((num_partitions >= 2) && (num_partitions <= 3));
+
+		const uint8_t packed = g_texel_partitions_6x6[seed][y * 6 + x];
+		const uint32_t subset = (num_partitions == 3) ? (packed >> 4) : packed;
+		return (int)(subset & 3);
+	}
+
 	void blue_contract(
 		int r, int g, int b, int a, 
 		int &dr, int &dg, int &db, int &da)
@@ -2144,25 +2266,7 @@ namespace astc_helpers
 		x.u = m | (e << 23) | (s << 31);
 		return x.f;
 	}
-
-	static inline half_float qlog16_to_half(int k)
-	{
-		assert((k >= 0) && (k <= 0xFFFF));
-
-		int E = (k & 0xF800) >> 11;
-		int M = k & 0x7FF;
-
-		int Mt;
-		if (M < 512)
-			Mt = 3 * M;
-		else if (M >= 1536)
-			Mt = 5 * M - 2048;
-		else
-			Mt = 4 * M - 512;
-
-		return (half_float)((E << 10) + (Mt >> 3));
-	}
-
+		
 	// See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
 	const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31;
 	const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS);
@@ -2514,7 +2618,8 @@ namespace astc_helpers
 
 		// Decode texels
 		const bool small_block = num_blk_pixels < 31;
-		const bool use_precomputed_texel_partitions = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
+		const bool use_precomputed_texel_partitions_4x4 = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
+		const bool use_precomputed_texel_partitions_6x6 = (blk_width == 6) && (blk_height == 6) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
 		const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX;
 		
 		bool success = true;
@@ -2527,9 +2632,17 @@ namespace astc_helpers
 				for (uint32_t x = 0; x < blk_width; x++)
 				{
 					const uint32_t pixel_index = x + y * blk_width;
-					const uint32_t subset = (log_blk.m_num_partitions > 1) ? 
-						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
-						: 0;
+					
+					uint32_t subset = 0;
+					if (log_blk.m_num_partitions > 1)
+					{
+						if (use_precomputed_texel_partitions_4x4)
+							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else if (use_precomputed_texel_partitions_6x6)
+							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else
+							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
+					}
 
 					int comp[3];
 
@@ -2592,9 +2705,17 @@ namespace astc_helpers
 				for (uint32_t x = 0; x < blk_width; x++)
 				{
 					const uint32_t pixel_index = x + y * blk_width;
-					const uint32_t subset = (log_blk.m_num_partitions > 1) ?
-						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
-						: 0;
+					
+					uint32_t subset = 0;
+					if (log_blk.m_num_partitions > 1)
+					{
+						if (use_precomputed_texel_partitions_4x4)
+							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else if (use_precomputed_texel_partitions_6x6)
+							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else
+							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
+					}
 
 					for (uint32_t c = 0; c < 4; c++)
 					{
@@ -2653,9 +2774,16 @@ namespace astc_helpers
 				{
 					const uint32_t pixel_index = x + y * blk_width;
 
-					const uint32_t subset = (log_blk.m_num_partitions > 1) ?
-						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
-						: 0;
+					uint32_t subset = 0;
+					if (log_blk.m_num_partitions > 1)
+					{
+						if (use_precomputed_texel_partitions_4x4)
+							subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else if (use_precomputed_texel_partitions_6x6)
+							subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
+						else
+							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
+					}
 
 					if (!is_ldr_endpoints[subset])
 					{
@@ -3235,10 +3363,10 @@ namespace astc_helpers
 		if (p < 2)
 			return false;
 		
-		log_blk.m_grid_width = W;
-		log_blk.m_grid_height = H;
+		log_blk.m_grid_width = (uint8_t)W;
+		log_blk.m_grid_height = (uint8_t)H;
 		
-		log_blk.m_weight_ise_range = (p - 2) + (P * BISE_10_LEVELS);
+		log_blk.m_weight_ise_range = (uint8_t)((p - 2) + (P * BISE_10_LEVELS));
 		assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
 
 		log_blk.m_dual_plane = Dp;
@@ -3441,16 +3569,16 @@ namespace astc_helpers
 
 		// Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane.
 
-		log_blk.m_num_partitions = bits.get_bits(11, 2) + 1;
+		log_blk.m_num_partitions = (uint8_t)(bits.get_bits(11, 2) + 1);
 		if (log_blk.m_num_partitions == 1)
-			log_blk.m_color_endpoint_modes[0] = bits.get_bits(13, 4); // read CEM bits
+			log_blk.m_color_endpoint_modes[0] = (uint8_t)(bits.get_bits(13, 4)); // read CEM bits
 		else
 		{
 			// 2 or more partitions
 			if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4))
 				return false;
 
-			log_blk.m_partition_id = bits.get_bits(13, 10);
+			log_blk.m_partition_id = (uint16_t)bits.get_bits(13, 10);
 
 			uint32_t cem_bits = bits.get_bits(23, 6);
 
@@ -3458,7 +3586,7 @@ namespace astc_helpers
 			{
 				// All CEM's the same
 				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
-					log_blk.m_color_endpoint_modes[i] = cem_bits >> 2;
+					log_blk.m_color_endpoint_modes[i] = (uint8_t)(cem_bits >> 2);
 			}
 			else
 			{
@@ -3511,7 +3639,7 @@ namespace astc_helpers
 
 				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
 				{
-					log_blk.m_color_endpoint_modes[i] = first_cem_index + (c[i] * 4) + m[i];
+					log_blk.m_color_endpoint_modes[i] = (uint8_t)(first_cem_index + (c[i] * 4) + m[i]);
 					assert(log_blk.m_color_endpoint_modes[i] <= 15);
 				}
 			}
@@ -3528,7 +3656,7 @@ namespace astc_helpers
 				return false;
 
 			uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
-			log_blk.m_color_component_selector = bits.get_bits(ccs_bit_pos, 2);
+			log_blk.m_color_component_selector = (uint8_t)(bits.get_bits(ccs_bit_pos, 2));
 		}
 
 		uint32_t config_bit_pos = 11 + 2; // config+num_parts
@@ -3569,7 +3697,7 @@ namespace astc_helpers
 		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
 			return false;
 
-		log_blk.m_endpoint_ise_range = endpoint_ise_range;
+		log_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
 
 		// Decode endpoints forwards in block
 		decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos);

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 2152 - 693
thirdparty/basis_universal/transcoder/basisu_containers.h


+ 778 - 280
thirdparty/basis_universal/transcoder/basisu_containers_impl.h

@@ -7,308 +7,806 @@
 
 namespace basisu
 {
-   bool elemental_vector::increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pMover, bool nofail)
-   {
-      assert(m_size <= m_capacity);
-
-      if (sizeof(void *) == sizeof(uint64_t))
-         assert(min_new_capacity < (0x400000000ULL / element_size));
-      else
-         assert(min_new_capacity < (0x7FFF0000U / element_size));
-
-      if (m_capacity >= min_new_capacity)
-         return true;
-
-      uint64_t new_capacity_u64 = min_new_capacity;
-      if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64)))
-          new_capacity_u64 = helpers::next_pow2(new_capacity_u64);
-
-      size_t new_capacity = (size_t)new_capacity_u64;
-      if (new_capacity != new_capacity_u64)
-      {
-          if (nofail)
-              return false;
-          fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n");
-          abort();
-      }
-            
-      const uint64_t desired_size_u64 = (uint64_t)element_size * new_capacity;
-            
-      const size_t desired_size = (size_t)desired_size_u64;
-      if (desired_size_u64 != desired_size)
-      {
-          if (nofail)
-              return false;
-          fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n");
-          abort();
-      }
-
-      size_t actual_size = 0;
-      if (!pMover)
-      {
-         void* new_p = realloc(m_p, desired_size);
-         if (!new_p)
-         {
-            if (nofail)
-               return false;
-
-            char buf[256];
-            snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size);
-            fprintf(stderr, "%s", buf);
-            abort();
-         }
+	// A container operation has internally panicked in an unrecoverable way.
+	// Either an allocation has failed, or a range or consistency check has failed.
+	// Formats the printf-style message pMsg into a 1024 byte local buffer, writes it to stderr
+	// and terminates the process via std::terminate(). Never returns.
+#ifdef _MSC_VER
+	__declspec(noreturn)
+#else
+	[[noreturn]]
+#endif
+	void container_abort(const char* pMsg, ...)
+	{
+		char buf[1024] = {};
+
+		va_list args;
+		va_start(args, pMsg);
+
+#ifdef _MSC_VER
+		vsprintf_s(buf, sizeof(buf), pMsg, args);
+#else
+		vsnprintf(buf, sizeof(buf), pMsg, args);
+#endif
+		va_end(args);
+
+		fputs(buf, stderr);
+
+		// Trap in assert-enabled builds only after the message has been written: asserting first
+		// would abort before the caller's diagnostic ever reached stderr.
+		assert(0);
+
+		std::terminate();
+	}
+
+	// Grows the vector's storage so it can hold at least min_new_capacity elements of element_size bytes each.
+	//   grow_hint   - when set, the new capacity is rounded up to the next power of 2.
+	//   pMover      - when null the storage is realloc()'d; otherwise a new block is malloc()'d, pMover is
+	//                 called as (*pMover)(new_p, m_p, m_size) and the old block is freed.
+	//   nofail_flag - when set, failures return false; otherwise they end in container_abort().
+	// Returns true when the capacity is (or already was) large enough.
+	bool elemental_vector::increase_capacity(size_t min_new_capacity, bool grow_hint, size_t element_size, object_mover pMover, bool nofail_flag)
+	{
+		assert(m_size <= m_capacity);
+		assert(min_new_capacity >= m_size);
+		assert(element_size);
+
+		// Basic sanity check min_new_capacity. The byte count is computed in 64 bits, so test for
+		// wrap-around explicitly: when size_t is 64 bits wide a wrapped product would always "fit"
+		// and the request would silently be under-allocated.
+		const bool request_wraps = (element_size != 0) && (min_new_capacity > (UINT64_MAX / element_size));
+
+		if (request_wraps || (!can_fit_into_size_t((uint64_t)min_new_capacity * element_size)))
+		{
+			assert(0);
+
+			if (nofail_flag)
+				return false;
+
+			container_abort("elemental_vector::increase_capacity: requesting too many elements\n");
+		}
+
+		// Check for sane library limits
+		if (sizeof(void*) == sizeof(uint64_t))
+		{
+			// 16 GB
+			assert(min_new_capacity < (0x400000000ULL / element_size));
+		}
+		else
+		{
+			// ~1.99 GB
+			assert(min_new_capacity < (0x7FFF0000U / element_size));
+		}
+
+		// If vector is already large enough just return.
+		if (m_capacity >= min_new_capacity)
+			return true;
+
+		uint64_t new_capacity_u64 = min_new_capacity;
+
+		if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64)))
+		{
+			new_capacity_u64 = helpers::next_pow2(new_capacity_u64);
+
+			// A rounded-up capacity smaller than the request means the rounding itself wrapped.
+			if ((new_capacity_u64 < min_new_capacity) || (!can_fit_into_size_t(new_capacity_u64)))
+			{
+				assert(0);
+
+				if (nofail_flag)
+					return false;
+
+				container_abort("elemental_vector::increase_capacity: vector too large\n");
+			}
+		}
+
+		// The power-of-2 rounding may have pushed the byte count past 64 bits; detect that before
+		// trusting the (well-defined, but wrapped) unsigned product.
+		const bool size_wraps = (element_size != 0) && (new_capacity_u64 > (UINT64_MAX / element_size));
+		const uint64_t desired_size_u64 = element_size * new_capacity_u64;
+
+		if (size_wraps || (!can_fit_into_size_t(desired_size_u64)))
+		{
+			assert(0);
+
+			if (nofail_flag)
+				return false;
+
+			container_abort("elemental_vector::increase_capacity: vector too large\n");
+		}
+
+		const size_t desired_size = static_cast<size_t>(desired_size_u64);
+
+		// Bytes the allocator actually handed back (only differs from desired_size on non-deterministic builds).
+		size_t actual_size = 0;
+		BASISU_NOTE_UNUSED(actual_size);
+
+		if (!pMover)
+		{
+			// No mover supplied: let realloc() relocate the raw bytes.
+			void* new_p = realloc(m_p, desired_size);
+			if (!new_p)
+			{
+				assert(0);
+
+				if (nofail_flag)
+					return false;
+
+				container_abort("elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size);
+			}
 
 #if BASISU_VECTOR_DETERMINISTIC
-         actual_size = desired_size;
+			actual_size = desired_size;
 #elif defined(_MSC_VER)
-         actual_size = _msize(new_p);
+			actual_size = _msize(new_p);
 #elif HAS_MALLOC_USABLE_SIZE
-         actual_size = malloc_usable_size(new_p);
+			actual_size = malloc_usable_size(new_p);
 #else
-         actual_size = desired_size;
+			actual_size = desired_size;
 #endif
-         m_p = new_p;
-      }
-      else
-      {
-         void* new_p = malloc(desired_size);
-         if (!new_p)
-         {
-            if (nofail)
-               return false;
-
-            char buf[256];
-            snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size);
-            fprintf(stderr, "%s", buf);
-            abort();
-         }
+			m_p = new_p;
+		}
+		else
+		{
+			// A mover was supplied: allocate a fresh block and let the mover transfer the m_size live elements.
+			void* new_p = malloc(desired_size);
+			if (!new_p)
+			{
+				assert(0);
+				if (nofail_flag)
+					return false;
+
+				container_abort("elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size);
+			}
 
 #if BASISU_VECTOR_DETERMINISTIC
-         actual_size = desired_size;
+			actual_size = desired_size;
 #elif defined(_MSC_VER)
-         actual_size = _msize(new_p);
+			actual_size = _msize(new_p);
 #elif HAS_MALLOC_USABLE_SIZE
-         actual_size = malloc_usable_size(new_p);
+			actual_size = malloc_usable_size(new_p);
 #else
-         actual_size = desired_size;
+			actual_size = desired_size;
 #endif
 
-         (*pMover)(new_p, m_p, m_size);
+			(*pMover)(new_p, m_p, m_size);
 
-         if (m_p)
-            free(m_p);
-         
-         m_p = new_p;
-      }
+			if (m_p)
+				free(m_p);
 
-      if (actual_size > desired_size)
-         m_capacity = static_cast<uint32_t>(actual_size / element_size);
-      else
-         m_capacity = static_cast<uint32_t>(new_capacity);
+			m_p = new_p;
+		}
 
-      return true;
-   }
+#if BASISU_VECTOR_DETERMINISTIC
+		m_capacity = static_cast<size_t>(new_capacity_u64);
+#else
+		// Take advantage of any slack the allocator returned beyond what was requested.
+		if (actual_size > desired_size)
+			m_capacity = static_cast<size_t>(actual_size / element_size);
+		else
+			m_capacity = static_cast<size_t>(new_capacity_u64);
+#endif
+
+		return true;
+	}
 
 #if BASISU_HASHMAP_TEST
 
 #define HASHMAP_TEST_VERIFY(c) do { if (!(c)) handle_hashmap_test_verify_failure(__LINE__); } while(0)
 
-   static void handle_hashmap_test_verify_failure(int line)
-   {
-      fprintf(stderr, "HASHMAP_TEST_VERIFY() faild on line %i\n", line);
-      abort();
-   }
-
-   class counted_obj
-   {
-   public:
-      counted_obj(uint32_t v = 0) :
-         m_val(v)
-      {
-         m_count++;
-      }
-
-      counted_obj(const counted_obj& obj) :
-         m_val(obj.m_val)
-      {
-         m_count++;
-      }
-
-      ~counted_obj()
-      {
-         assert(m_count > 0);
-         m_count--;
-      }
-
-      static uint32_t m_count;
-
-      uint32_t m_val;
-
-      operator size_t() const { return m_val; }
-
-      bool operator== (const counted_obj& rhs) const { return m_val == rhs.m_val; }
-      bool operator== (const uint32_t rhs) const { return m_val == rhs; }
-
-   };
-
-   uint32_t counted_obj::m_count;
-
-   static uint32_t urand32()
-   {
-      uint32_t a = rand();
-      uint32_t b = rand() << 15;
-      uint32_t c = rand() << (32 - 15);
-      return a ^ b ^ c;
-   }
-
-   static int irand32(int l, int h)
-   {
-      assert(l < h);
-      if (l >= h)
-         return l;
-
-      uint32_t range = static_cast<uint32_t>(h - l);
-
-      uint32_t rnd = urand32();
-
-      uint32_t rnd_range = static_cast<uint32_t>((((uint64_t)range) * ((uint64_t)rnd)) >> 32U);
-
-      int result = l + rnd_range;
-      assert((result >= l) && (result < h));
-      return result;
-   }
-
-   void hash_map_test()
-   {
-      {
-         basisu::hash_map<uint64_t, uint64_t> k;
-         basisu::hash_map<uint64_t, uint64_t> l;
-         std::swap(k, l);
-
-         k.begin();
-         k.end();
-         k.clear();
-         k.empty();
-         k.erase(0);
-         k.insert(0, 1);
-         k.find(0);
-         k.get_equals();
-         k.get_hasher();
-         k.get_table_size();
-         k.reset();
-         k.reserve(1);
-         k = l;
-         k.set_equals(l.get_equals());
-         k.set_hasher(l.get_hasher());
-         k.get_table_size();
-      }
-
-      uint32_t seed = 0;
-      for (; ; )
-      {
-         seed++;
-
-         typedef basisu::hash_map<counted_obj, counted_obj> my_hash_map;
-         my_hash_map m;
-
-         const uint32_t n = irand32(0, 100000);
-
-         printf("%u\n", n);
-
-         srand(seed); // r1.seed(seed);
-
-         basisu::vector<int> q;
-
-         uint32_t count = 0;
-         for (uint32_t i = 0; i < n; i++)
-         {
-            uint32_t v = urand32() & 0x7FFFFFFF;
-            my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef));
-            if (res.second)
-            {
-               count++;
-               q.push_back(v);
-            }
-         }
-
-         HASHMAP_TEST_VERIFY(m.size() == count);
-
-         srand(seed);
-
-         my_hash_map cm(m);
-         m.clear();
-         m = cm;
-         cm.reset();
-
-         for (uint32_t i = 0; i < n; i++)
-         {
-            uint32_t v = urand32() & 0x7FFFFFFF;
-            my_hash_map::const_iterator it = m.find(counted_obj(v));
-            HASHMAP_TEST_VERIFY(it != m.end());
-            HASHMAP_TEST_VERIFY(it->first == v);
-            HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef));
-         }
-
-         for (uint32_t t = 0; t < 2; t++)
-         {
-            const uint32_t nd = irand32(1, q.size() + 1);
-            for (uint32_t i = 0; i < nd; i++)
-            {
-               uint32_t p = irand32(0, q.size());
-
-               int k = q[p];
-               if (k >= 0)
-               {
-                  q[p] = -k - 1;
-
-                  bool s = m.erase(counted_obj(k));
-                  HASHMAP_TEST_VERIFY(s);
-               }
-            }
-
-            typedef basisu::hash_map<uint32_t, empty_type> uint_hash_set;
-            uint_hash_set s;
-
-            for (uint32_t i = 0; i < q.size(); i++)
-            {
-               int v = q[i];
-
-               if (v >= 0)
-               {
-                  my_hash_map::const_iterator it = m.find(counted_obj(v));
-                  HASHMAP_TEST_VERIFY(it != m.end());
-                  HASHMAP_TEST_VERIFY(it->first == (uint32_t)v);
-                  HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef));
-
-                  s.insert(v);
-               }
-               else
-               {
-                  my_hash_map::const_iterator it = m.find(counted_obj(-v - 1));
-                  HASHMAP_TEST_VERIFY(it == m.end());
-               }
-            }
-
-            uint32_t found_count = 0;
-            for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it)
-            {
-               HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef));
-
-               uint_hash_set::const_iterator fit(s.find((uint32_t)it->first));
-               HASHMAP_TEST_VERIFY(fit != s.end());
-
-               HASHMAP_TEST_VERIFY(fit->first == it->first);
-
-               found_count++;
-            }
-
-            HASHMAP_TEST_VERIFY(found_count == s.size());
-         }
-
-         HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2);
-      }
-   }
+	static void handle_hashmap_test_verify_failure(int line) // Called by HASHMAP_TEST_VERIFY() when its condition is false; line is the __LINE__ of the failed check.
+	{
+		container_abort("HASHMAP_TEST_VERIFY() faild on line %i\n", line); // hands the formatted message to container_abort()
+	}
+
+	// Test helper that tracks how many live counted_obj values exist in the static m_count.
+	// A moved-from object is marked with the sentinel value UINT64_MAX and no longer counts as live,
+	// which lets the hash map test check that no objects are leaked or destroyed twice.
+	class counted_obj
+	{
+	public:
+		// Creates a live object holding v.
+		counted_obj(uint32_t v = 0) :
+			m_val(v)
+		{
+			m_count++;
+		}
+
+		// Copying a live object creates another live object; copying a moved-from one does not.
+		counted_obj(const counted_obj& obj) :
+			m_val(obj.m_val)
+		{
+			if (m_val != UINT64_MAX)
+				m_count++;
+		}
+
+		// Moving transfers the value (and its slot in m_count) and marks the source as moved-from.
+		counted_obj(counted_obj&& obj) :
+			m_val(obj.m_val)
+		{
+			obj.m_val = UINT64_MAX;
+		}
+
+		counted_obj& operator= (counted_obj&& rhs)
+		{
+			if (this != &rhs)
+			{
+				// The value being overwritten stops existing here, so release its count first;
+				// the destructor will only account for the value taken over from rhs.
+				if (m_val != UINT64_MAX)
+				{
+					assert(m_count > 0);
+					m_count--;
+				}
+
+				m_val = rhs.m_val;
+				rhs.m_val = UINT64_MAX;
+			}
+			return *this;
+		}
+
+		// Only live (not moved-from) objects give their count back.
+		~counted_obj()
+		{
+			if (m_val != UINT64_MAX)
+			{
+				assert(m_count > 0);
+				m_count--;
+			}
+		}
+
+		// Number of currently live counted_obj values.
+		static uint32_t m_count;
+
+		// Stored value, or UINT64_MAX once the object has been moved from.
+		uint64_t m_val;
+
+		operator size_t() const { return (size_t)m_val; }
+
+		bool operator== (const counted_obj& rhs) const { return m_val == rhs.m_val; }
+		bool operator== (const uint32_t rhs) const { return m_val == rhs; }
+
+	};
+
+	uint32_t counted_obj::m_count;
+
+	// Builds a 32-bit pseudo-random value from three consecutive rand() calls, XORed together
+	// at bit offsets 0, 15 and 17.
+	static uint32_t urand32()
+	{
+		// Convert to unsigned before shifting: left-shifting the int returned by rand() can
+		// overflow (undefined behavior before C++20) when RAND_MAX is wider than 15 bits.
+		const uint32_t a = (uint32_t)rand();
+		const uint32_t b = (uint32_t)rand() << 15;
+		const uint32_t c = (uint32_t)rand() << (32 - 15);
+		return a ^ b ^ c;
+	}
+
+	// Returns a pseudo-random integer in [l, h). Requires l < h; if that does not hold
+	// (and asserts are disabled) l is returned unchanged.
+	static int irand32(int l, int h)
+	{
+		assert(l < h);
+		if (h <= l)
+			return l;
+
+		// Scale a 32-bit random value into [0, span) with a 64-bit multiply followed by a shift.
+		const uint64_t span = static_cast<uint32_t>(h - l);
+		const uint64_t sample = urand32();
+		const uint32_t offset = static_cast<uint32_t>((span * sample) >> 32U);
+
+		const int picked = l + offset;
+		assert((picked >= l) && (picked < h));
+		return picked;
+	}
+
+	// Stress test for basisu::hash_map. Runs a few fixed smoke tests and then loops forever over
+	// randomized insert/find/erase rounds, reporting any failed check through HASHMAP_TEST_VERIFY().
+	// This function never returns.
+	void hash_map_test()
+	{
+		// Smoke test 1: insert 1M keys, copy the map, then erase every key from the original in
+		// ascending order and from the copy in shuffled order.
+		{
+			basisu::hash_map<uint32_t> s;
+			uint_vec k;
+
+			for (uint32_t i = 0; i < 1000000; i++)
+			{
+				s.insert(i);
+				k.push_back(i);
+			}
+
+			// Shuffle the key list.
+			for (uint32_t i = 0; i < k.size(); i++)
+			{
+				// Shift as unsigned: shifting the int returned by rand() can overflow.
+				uint32_t r = (uint32_t)rand() ^ ((uint32_t)rand() << 15);
+
+				uint32_t j = i + (r % (k.size() - i));
+
+				std::swap(k[i], k[j]);
+			}
+
+			basisu::hash_map<uint32_t> s1(s);
+
+			for (uint32_t i = 0; i < 1000000; i++)
+			{
+				auto res = s.find(i);
+				HASHMAP_TEST_VERIFY(res != s.end());
+				HASHMAP_TEST_VERIFY(res->first == i);
+				s.erase(i);
+			}
+
+			for (uint32_t it = 0; it < 1000000; it++)
+			{
+				uint32_t i = k[it];
+
+				auto res = s1.find(i);
+				// Compare against the end of the container that was searched (s1, not s); otherwise
+				// a failed lookup could never be detected.
+				HASHMAP_TEST_VERIFY(res != s1.end());
+				HASHMAP_TEST_VERIFY(res->first == i);
+				s1.erase(i);
+			}
+
+			for (uint32_t i = 0; i < 1000000; i++)
+			{
+				auto res = s.find(i);
+				HASHMAP_TEST_VERIFY(res == s.end());
+
+				auto res1 = s1.find(i);
+				HASHMAP_TEST_VERIFY(res1 == s1.end());
+			}
+
+			HASHMAP_TEST_VERIFY(s.empty());
+			HASHMAP_TEST_VERIFY(s1.empty());
+		}
+
+		// Smoke test 2: try_insert() with moved values and with a key/value pair.
+		{
+			typedef basisu::hash_map< uint32_t, basisu::vector<uint32_t> > hm;
+			hm q;
+
+			basisu::vector<uint32_t> a, b;
+			a.push_back(1);
+			b.push_back(2);
+			b.push_back(3);
+
+			basisu::vector<uint32_t> c(b);
+
+			hm::insert_result ir;
+			q.try_insert(ir, 1, std::move(a));
+			q.try_insert(ir, 2, std::move(b));
+			q.try_insert(ir, std::make_pair(3, c));
+		}
+
+		// Smoke test 3: insert() with a moved key and a moved value.
+		{
+			typedef basisu::hash_map<counted_obj, counted_obj> my_hash_map;
+			my_hash_map m;
+			counted_obj a, b;
+			m.insert(std::move(a), std::move(b));
+		}
+
+		// Smoke test 4: exercise the basic API once so that every member gets instantiated.
+		{
+			basisu::hash_map<uint64_t, uint64_t> k;
+			basisu::hash_map<uint64_t, uint64_t> l;
+			std::swap(k, l);
+
+			k.begin();
+			k.end();
+			k.clear();
+			k.empty();
+			k.erase(0);
+			k.insert(0, 1);
+			k.find(0);
+			k.get_equals();
+			k.get_hasher();
+			k.get_table_size();
+			k.reset();
+			k.reserve(1);
+			k = l;
+			k.set_equals(l.get_equals());
+			k.set_hasher(l.get_hasher());
+			k.get_table_size();
+		}
+
+		// Randomized rounds; every round reseeds rand() with a new seed.
+		uint32_t seed = 0;
+		for (; ; )
+		{
+			seed++;
+
+			typedef basisu::hash_map<counted_obj, counted_obj> my_hash_map;
+			my_hash_map m;
+
+			const uint32_t n = irand32(1, 100000);
+
+			printf("%u\n", n);
+
+			srand(seed); // r1.seed(seed);
+
+			// Keys that were actually inserted; an erased key k is later stored as -k - 1.
+			basisu::vector<int> q;
+
+			uint32_t count = 0;
+			for (uint32_t i = 0; i < n; i++)
+			{
+				uint32_t v = urand32() & 0x7FFFFFFF;
+				my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef));
+				if (res.second)
+				{
+					count++;
+					q.push_back(v);
+				}
+			}
+
+			HASHMAP_TEST_VERIFY(m.size() == count);
+
+			// Reseed so the lookup loop below replays the same key sequence.
+			srand(seed);
+
+			// Round-trip the contents through a copy and an assignment.
+			my_hash_map cm(m);
+			m.clear();
+			m = cm;
+			cm.reset();
+
+			for (uint32_t i = 0; i < n; i++)
+			{
+				uint32_t v = urand32() & 0x7FFFFFFF;
+				my_hash_map::const_iterator it = m.find(counted_obj(v));
+				HASHMAP_TEST_VERIFY(it != m.end());
+				HASHMAP_TEST_VERIFY(it->first == v);
+				HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef));
+			}
+
+			for (uint32_t t = 0; t < 2; t++)
+			{
+				// Erase a random selection of the keys that are still present.
+				const uint32_t nd = irand32(1, q.size_u32() + 1);
+				for (uint32_t i = 0; i < nd; i++)
+				{
+					uint32_t p = irand32(0, q.size_u32());
+
+					int k = q[p];
+					if (k >= 0)
+					{
+						q[p] = -k - 1;
+
+						bool s = m.erase(counted_obj(k));
+						HASHMAP_TEST_VERIFY(s);
+					}
+				}
+
+				typedef basisu::hash_map<uint32_t, empty_type> uint_hash_set;
+				uint_hash_set s;
+
+				// Every surviving key must still be found, every erased key must be gone.
+				for (uint32_t i = 0; i < q.size(); i++)
+				{
+					int v = q[i];
+
+					if (v >= 0)
+					{
+						my_hash_map::const_iterator it = m.find(counted_obj(v));
+						HASHMAP_TEST_VERIFY(it != m.end());
+						HASHMAP_TEST_VERIFY(it->first == (uint32_t)v);
+						HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef));
+
+						s.insert(v);
+					}
+					else
+					{
+						my_hash_map::const_iterator it = m.find(counted_obj(-v - 1));
+						HASHMAP_TEST_VERIFY(it == m.end());
+					}
+				}
+
+				// Iterating the map must visit exactly the surviving keys.
+				uint32_t found_count = 0;
+				for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it)
+				{
+					HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef));
+
+					uint_hash_set::const_iterator fit(s.find((uint32_t)it->first));
+					HASHMAP_TEST_VERIFY(fit != s.end());
+
+					HASHMAP_TEST_VERIFY(fit->first == it->first);
+
+					found_count++;
+				}
+
+				HASHMAP_TEST_VERIFY(found_count == s.size());
+			}
+
+			// Each map entry holds one key object and one value object, so exactly two live counted_obj per entry.
+			HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2);
+		}
+	}
 
 #endif // BASISU_HASHMAP_TEST
 
+	// String formatting
+
+	// Converts this variant to text, writing the result to res.
+	//
+	// fmt is a printf-style conversion spec WITHOUT the leading '%' (e.g. "08X", ".3f", or empty for the default).
+	// Only digits, '.', ' ', '#', '+', '-' and a single trailing alphabetic conversion character are accepted.
+	// Note fmt may be modified: a default conversion character is appended when none was given, and for 64-bit
+	// types a trailing 'x'/'X' is replaced by the platform's PRIx64/PRIX64 sequence.
+	//
+	// Returns false (res is left empty or unchanged from the initial clear) if the spec contains a disallowed character,
+	// if the conversion character isn't valid for the variant's type, or if the variant is invalid/a null string pointer.
+	bool fmt_variant::to_string(std::string& res, std::string& fmt) const
+	{
+		res.resize(0);
+
+		// Scan for allowed formatting characters.
+		for (size_t i = 0; i < fmt.size(); i++)
+		{
+			// The <cctype> classification functions have undefined behavior for negative values other than EOF,
+			// so always go through unsigned char first (the spec may contain arbitrary bytes from the caller).
+			const unsigned char c = (unsigned char)fmt[i];
+
+			if (isdigit(c) || (c == '.') || (c == ' ') || (c == '#') || (c == '+') || (c == '-'))
+				continue;
+
+			// An alphabetic character is only permitted as the final (conversion) character.
+			if (isalpha(c))
+			{
+				if ((i + 1) == fmt.size())
+					continue;
+			}
+
+			return false;
+		}
+
+		// 'c' conversion: only valid for 32-bit integers holding a value in [0,255].
+		if (fmt.size() && (fmt.back() == 'c'))
+		{
+			if ((m_type == variant_type::cI32) || (m_type == variant_type::cU32))
+			{
+				if (m_u32 > 255)
+					return false;
+
+				// Explicitly allowing caller to pass in a char of 0, which is ignored.
+				if (m_u32)
+					res.push_back((uint8_t)m_u32);
+				return true;
+			}
+			else
+				return false;
+		}
+
+		switch (m_type)
+		{
+		case variant_type::cInvalid:
+		{
+			return false;
+		}
+		case variant_type::cI32:
+		{
+			if (fmt.size())
+			{
+				// Convert via unsigned char so isalpha() never sees a negative value.
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					if ((e != 'x') && (e != 'X') && (e != 'i') && (e != 'd') && (e != 'u'))
+						return false;
+				}
+				else
+				{
+					// No conversion character supplied: default to signed decimal.
+					fmt += "i";
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_i32);
+			}
+			else
+			{
+				res = string_format("%i", m_i32);
+			}
+			break;
+		}
+		case variant_type::cU32:
+		{
+			if (fmt.size())
+			{
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					if ((e != 'x') && (e != 'X') && (e != 'i') && (e != 'd') && (e != 'u'))
+						return false;
+				}
+				else
+				{
+					// No conversion character supplied: default to unsigned decimal.
+					fmt += "u";
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_u32);
+			}
+			else
+			{
+				res = string_format("%u", m_u32);
+			}
+			break;
+		}
+		case variant_type::cI64:
+		{
+			if (fmt.size())
+			{
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					// Only hex conversions may be requested explicitly for 64-bit values; the single conversion
+					// character is swapped for the platform-correct length modifier + conversion.
+					if (e == 'x')
+					{
+						fmt.pop_back();
+						fmt += PRIx64;
+					}
+					else if (e == 'X')
+					{
+						fmt.pop_back();
+						fmt += PRIX64;
+					}
+					else
+						return false;
+				}
+				else
+				{
+					fmt += PRId64;
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_i64);
+			}
+			else
+			{
+				res = string_format("%" PRId64, m_i64);
+			}
+			break;
+		}
+		case variant_type::cU64:
+		{
+			if (fmt.size())
+			{
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					if (e == 'x')
+					{
+						fmt.pop_back();
+						fmt += PRIx64;
+					}
+					else if (e == 'X')
+					{
+						fmt.pop_back();
+						fmt += PRIX64;
+					}
+					else
+						return false;
+				}
+				else
+				{
+					fmt += PRIu64;
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_u64);
+			}
+			else
+			{
+				res = string_format("%" PRIu64, m_u64);
+			}
+			break;
+		}
+		case variant_type::cFlt:
+		{
+			if (fmt.size())
+			{
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					if ((e != 'f') && (e != 'g') && (e != 'e') && (e != 'E'))
+						return false;
+				}
+				else
+				{
+					// No conversion character supplied: default to fixed-point.
+					fmt += "f";
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_flt);
+			}
+			else
+			{
+				res = string_format("%f", m_flt);
+			}
+			break;
+		}
+		case variant_type::cDbl:
+		{
+			if (fmt.size())
+			{
+				const int e = (unsigned char)fmt.back();
+				if (isalpha(e))
+				{
+					if ((e != 'f') && (e != 'g') && (e != 'e') && (e != 'E'))
+						return false;
+				}
+				else
+				{
+					fmt += "f";
+				}
+
+				res = string_format((std::string("%") + fmt).c_str(), m_dbl);
+			}
+			else
+			{
+				res = string_format("%f", m_dbl);
+			}
+			break;
+		}
+		// Strings and bools accept no format spec at all.
+		case variant_type::cStrPtr:
+		{
+			if (fmt.size())
+				return false;
+			if (!m_pStr)
+				return false;
+			res = m_pStr;
+			break;
+		}
+		case variant_type::cBool:
+		{
+			if (fmt.size())
+				return false;
+			res = m_bool ? "true" : "false";
+			break;
+		}
+		case variant_type::cStdStr:
+		{
+			if (fmt.size())
+				return false;
+			res = m_str;
+			break;
+		}
+		default:
+		{
+			return false;
+		}
+		}
+
+		return true;
+	}
+
+	// Expands a "{}"-style format string into res.
+	//
+	// Each "{spec}" in pFmt is replaced by the next entry of variants, converted with fmt_variant::to_string() using
+	// the text between the braces as the conversion spec. "{{" emits a literal '{'. All other characters are copied as-is.
+	//
+	// Returns false (after asserting) if pFmt is null, a spec contains '*', 'n' or '%', a conversion fails,
+	// a '{' is left unterminated, or the number of "{}" fields doesn't match variants.size().
+	// On failure res may hold the partially built output.
+	bool fmt_variants(std::string& res, const char* pFmt, const fmt_variant_vec& variants)
+	{
+		res.clear();
+
+		// A format string is mandatory.
+		if (pFmt == nullptr)
+		{
+			assert(0);
+			return false;
+		}
+
+		// An empty format string is only valid when there is nothing to format.
+		const size_t fmt_len = strlen(pFmt);
+		if (fmt_len == 0)
+		{
+			if (variants.size())
+			{
+				assert(0);
+				return false;
+			}
+			return true;
+		}
+
+		// Rough guess at the final length.
+		res.reserve(fmt_len + 32);
+
+		std::string spec;
+		spec.reserve(16);
+
+		std::string text;
+		text.reserve(16);
+
+		size_t next_variant = 0;
+		bool in_spec = false;
+
+		for (size_t pos = 0; pos < fmt_len; )
+		{
+			const uint8_t ch = pFmt[pos++];
+
+			if (!in_spec)
+			{
+				// Ordinary text: copy through.
+				if (ch != '{')
+				{
+					res.push_back((char)ch);
+					continue;
+				}
+
+				// "{{" is an escaped brace (pFmt[pos] is at worst the terminating null here).
+				if (pFmt[pos] == '{')
+				{
+					res.push_back((char)ch);
+					pos++;
+				}
+				else
+				{
+					in_spec = true;
+					spec.resize(0);
+				}
+				continue;
+			}
+
+			if (ch != '}')
+			{
+				// Reject formatting characters that must never reach the printf-style backend.
+				if ((ch == '*') || (ch == 'n') || (ch == '%'))
+				{
+					assert(0);
+					return false;
+				}
+
+				spec.push_back(ch);
+				continue;
+			}
+
+			// Closing brace: format the next variant with the collected spec.
+			in_spec = false;
+
+			if (next_variant >= variants.size())
+			{
+				assert(0);
+				return false;
+			}
+
+			if (!variants[next_variant].to_string(text, spec))
+			{
+				assert(0);
+				return false;
+			}
+
+			res += text;
+			next_variant++;
+		}
+
+		// Unterminated '{', or some variants were never consumed.
+		if (in_spec || (next_variant != variants.size()))
+		{
+			assert(0);
+			return false;
+		}
+
+		return true;
+	}
+
 } // namespace basisu

+ 7 - 4
thirdparty/basis_universal/transcoder/basisu_file_headers.h

@@ -38,7 +38,7 @@ namespace basist
 		basisu::packed_uint<2> m_orig_width;	// The original image width (may not be a multiple of 4 pixels)
 		basisu::packed_uint<2> m_orig_height;  // The original image height (may not be a multiple of 4 pixels)
 
-		basisu::packed_uint<2> m_num_blocks_x;	// The slice's block X dimensions. Each block is 4x4 pixels. The slice's pixel resolution may or may not be a power of 2.
+		basisu::packed_uint<2> m_num_blocks_x;	// The slice's block X dimensions. Each block is 4x4 or 6x6 pixels. The slice's pixel resolution may or may not be a power of 2.
 		basisu::packed_uint<2> m_num_blocks_y;	// The slice's block Y dimensions. 
 
 		basisu::packed_uint<4> m_file_ofs;		// Offset from the start of the file to the start of the slice's data
@@ -72,9 +72,9 @@ namespace basist
 	// We do make sure the various constraints are followed (2DArray/cubemap/videoframes/volume implies that each image has the same resolution and # of mipmap levels, etc., cubemap implies that the # of image slices is a multiple of 6)
 	enum basis_texture_type
 	{
-		cBASISTexType2D = 0,					// An arbitrary array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image may have a different resolution and # of mipmap levels
+		cBASISTexType2D = 0,				// An arbitrary array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image may have a different resolution and # of mipmap levels
 		cBASISTexType2DArray = 1,			// An array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image has the same resolution and mipmap levels
-		cBASISTexTypeCubemapArray = 2,	// an array of cubemap levels, total # of images must be divisable by 6, in X+, X-, Y+, Y-, Z+, Z- order, with optional mipmaps
+		cBASISTexTypeCubemapArray = 2,		// an array of cubemap levels, total # of images must be divisible by 6, in X+, X-, Y+, Y-, Z+, Z- order, with optional mipmaps
 		cBASISTexTypeVideoFrames = 3,		// An array of 2D video frames, with optional mipmaps, # frames = # images, each image has the same resolution and # of mipmap levels
 		cBASISTexTypeVolume = 4,			// A 3D texture with optional mipmaps, Z dimension = # images, each image has the same resolution and # of mipmap levels
 
@@ -90,7 +90,10 @@ namespace basist
 	{
 		cETC1S = 0,
 		cUASTC4x4 = 1,
-		cUASTC_HDR_4x4 = 2
+		cUASTC_HDR_4x4 = 2,
+		cASTC_HDR_6x6 = 3,
+		cASTC_HDR_6x6_INTERMEDIATE = 4,
+		cTotalFormats
 	};
 
 	struct basis_file_header

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 769 - 81
thirdparty/basis_universal/transcoder/basisu_transcoder.cpp


+ 253 - 53
thirdparty/basis_universal/transcoder/basisu_transcoder.h

@@ -79,30 +79,32 @@ namespace basist
 													// Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now.
 													// See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h.
 
-		cTFPVRTC2_4_RGB = 18,					// Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB).
-		cTFPVRTC2_4_RGBA = 19,					// Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks.
+		cTFPVRTC2_4_RGB = 18,						// Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB).
+		cTFPVRTC2_4_RGBA = 19,						// Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks.
 
-		cTFETC2_EAC_R11 = 20,					// R only (ETC2 EAC R11 unsigned)
-		cTFETC2_EAC_RG11 = 21,					// RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps
+		cTFETC2_EAC_R11 = 20,						// R only (ETC2 EAC R11 unsigned)
+		cTFETC2_EAC_RG11 = 21,						// RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps
 
-		cTFBC6H = 22,							// HDR, RGB only, unsigned
-		cTFASTC_HDR_4x4_RGBA = 23,				// HDR, RGBA (currently UASTC HDR is only RGB), unsigned
+		cTFBC6H = 22,								// HDR, RGB only, unsigned
+		cTFASTC_HDR_4x4_RGBA = 23,					// HDR, RGBA (currently UASTC HDR 4x4 encoders are only RGB), unsigned
 
 		// Uncompressed (raw pixel) formats
 		// Note these uncompressed formats (RGBA32, 565, and 4444) can only be transcoded to from LDR input files (ETC1S or UASTC LDR).
-		cTFRGBA32 = 13,							// 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte.
-		cTFRGB565 = 14,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11
-		cTFBGR565 = 15,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0
-		cTFRGBA4444 = 16,						// 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0
+		cTFRGBA32 = 13,								// 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte.
+		cTFRGB565 = 14,								// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11
+		cTFBGR565 = 15,								// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0
+		cTFRGBA4444 = 16,							// 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0
 		
-		// Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR).
-		cTFRGB_HALF = 24,						// 48bpp RGB half (16-bits/component, 3 components)
-		cTFRGBA_HALF = 25,						// 64bpp RGBA half (16-bits/component, 4 components) (A will always currently 1.0, UASTC_HDR doesn't support alpha)
-		cTFRGB_9E5 = 26,						// 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent)
+		// Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR 4x4 or ASTC HDR 6x6).
+		cTFRGB_HALF = 24,							// 48bpp RGB half (16-bits/component, 3 components)
+		cTFRGBA_HALF = 25,							// 64bpp RGBA half (16-bits/component, 4 components) (A will currently always be 1.0, UASTC_HDR doesn't support alpha)
+		cTFRGB_9E5 = 26,							// 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent)
 
-		cTFTotalTextureFormats = 27,
+		cTFASTC_HDR_6x6_RGBA = 27,					// HDR, RGBA (currently our ASTC HDR 6x6 encodes are only RGB), unsigned
 
-		// Old enums for compatibility with code compiled against previous versions
+		cTFTotalTextureFormats = 28,
+
+		// ----- The following are old/legacy enums for compatibility with code compiled against previous versions
 		cTFETC1 = cTFETC1_RGB,
 		cTFETC2 = cTFETC2_RGBA,
 		cTFBC1 = cTFBC1_RGB,
@@ -111,8 +113,8 @@ namespace basist
 		cTFBC5 = cTFBC5_RG,
 
 		// Previously, the caller had some control over which BC7 mode the transcoder output. We've simplified this due to UASTC, which supports numerous modes.
-		cTFBC7_M6_RGB = cTFBC7_RGBA,			// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats.
-		cTFBC7_M5_RGBA = cTFBC7_RGBA,			// Opaque+alpha, alpha channel will be opaque for opaque .basis files
+		cTFBC7_M6_RGB = cTFBC7_RGBA,				// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats.
+		cTFBC7_M5_RGBA = cTFBC7_RGBA,				// Opaque+alpha, alpha channel will be opaque for opaque .basis files
 		cTFBC7_M6_OPAQUE_ONLY = cTFBC7_RGBA,
 		cTFBC7_M5 = cTFBC7_RGBA,
 		cTFBC7_ALT = 7,
@@ -138,6 +140,9 @@ namespace basist
 	// Returns true if the format is HDR.
 	bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt);
 
+	// Returns true if the format is LDR.
+	inline bool basis_transcoder_format_is_ldr(transcoder_texture_format fmt) { return !basis_transcoder_format_is_hdr(fmt); }
+
 	// Returns the basisu::texture_format corresponding to the specified transcoder_texture_format.
 	basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt);
 
@@ -159,14 +164,25 @@ namespace basist
 	// Returns true if the specified format was enabled at compile time, and is supported for the specific basis/ktx2 texture format (ETC1S, UASTC, or UASTC HDR).
 	bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S);
 
+	// Returns the block width/height for the specified basis texture file format.
+	uint32_t basis_tex_format_get_block_width(basis_tex_format fmt);
+	uint32_t basis_tex_format_get_block_height(basis_tex_format fmt);
+		
+	bool basis_tex_format_is_hdr(basis_tex_format fmt);
+	inline bool basis_tex_format_is_ldr(basis_tex_format fmt) { return !basis_tex_format_is_hdr(fmt); }
+		
 	// Validates that the output buffer is large enough to hold the entire transcoded texture.
 	// For uncompressed texture formats, most input parameters are in pixels, not blocks. Blocks are 4x4 pixels.
 	bool basis_validate_output_buffer_size(transcoder_texture_format target_format,
 		uint32_t output_blocks_buf_size_in_blocks_or_pixels,
 		uint32_t orig_width, uint32_t orig_height,
 		uint32_t output_row_pitch_in_blocks_or_pixels,
-		uint32_t output_rows_in_pixels,
-		uint32_t total_slice_blocks);
+		uint32_t output_rows_in_pixels);
+
+	// Computes the size in bytes of a transcoded image or texture, taking into account the format's block width/height and any minimum size PVRTC1 requirements required by OpenGL.
+	// Note the returned value is not necessarily the # of bytes a transcoder could write to the output buffer due to these minimum PVRTC1 requirements.
+	// (These PVRTC1 requirements are not ours, but OpenGL's.)
+	uint32_t basis_compute_transcoded_image_size_in_bytes(transcoder_texture_format target_format, uint32_t orig_width, uint32_t orig_height);
 
 	class basisu_transcoder;
 
@@ -197,7 +213,9 @@ namespace basist
 		}
 	};
 
-	// Low-level helper class that does the actual transcoding.
+	// Low-level helper classes that do the actual transcoding.
+	
+	// ETC1S
 	class basisu_lowlevel_etc1s_transcoder
 	{
 		friend class basisu_transcoder;
@@ -216,18 +234,18 @@ namespace basist
 
 		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
 			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
-			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0);
+			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0, uint32_t decode_flags = 0);
 
 		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
 			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
-			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0)
+			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0, uint32_t decode_flags = 0)
 		{
 			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks,
 				header.m_tex_type == cBASISTexTypeVideoFrames, (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0, slice_desc.m_level_index,
 				slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, pState,
 				astc_transcode_alpha,
 				pAlpha_blocks,
-				output_rows_in_pixels);
+				output_rows_in_pixels, decode_flags);
 		}
 
 		// Container independent transcoding
@@ -292,15 +310,92 @@ namespace basist
 		// Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format.
 		cDecodeFlagsOutputHasAlphaIndices = 16,
 
-		cDecodeFlagsHighQuality = 32
+		cDecodeFlagsHighQuality = 32,
+
+		cDecodeFlagsNoETC1SChromaFiltering = 64
+	};
+
+	// UASTC LDR 4x4
+	// Low-level transcoder for UASTC LDR 4x4 slice data.
+	// Only the interface is declared here; apart from the inline forwarding overload below, the members are defined elsewhere.
+	class basisu_lowlevel_uastc_ldr_4x4_transcoder
+	{
+		// basisu_transcoder is granted access to this class's non-public members.
+		friend class basisu_transcoder;
+
+	public:
+		basisu_lowlevel_uastc_ldr_4x4_transcoder();
+
+		// Transcodes a single slice, with the alpha flag and original dimensions passed explicitly.
+		// NOTE(review): a 0 for output_row_pitch_in_blocks_or_pixels/output_rows_in_pixels presumably selects the slice's own dimensions (as documented for basisu_transcoder) - confirm against the implementation.
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0);
+
+		// Convenience overload for .basis files: derives has_alpha from the header's cBASISHeaderFlagHasAlphaSlices flag and
+		// the original width/height from the slice descriptor, then forwards everything to the overload above.
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0)
+		{
+			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt,
+				output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels,
+				pState, output_rows_in_pixels, channel0, channel1, decode_flags);
+		}
+
+		// Container independent transcoding
+		// Takes the compressed data plus an explicit slice offset/length rather than .basis header structures.
+		bool transcode_image(
+			transcoder_texture_format target_format,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+			uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+			uint32_t slice_offset, uint32_t slice_length,
+			uint32_t decode_flags = 0,
+			bool has_alpha = false,
+			bool is_video = false,
+			uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr,
+			uint32_t output_rows_in_pixels = 0,
+			int channel0 = -1, int channel1 = -1);
+	};
+
+	// UASTC HDR 4x4
+	// Low-level transcoder for UASTC HDR 4x4 slice data.
+	// Its interface mirrors basisu_lowlevel_uastc_ldr_4x4_transcoder; apart from the inline forwarding overload below, the members are defined elsewhere.
+	class basisu_lowlevel_uastc_hdr_4x4_transcoder
+	{
+		// basisu_transcoder is granted access to this class's non-public members.
+		friend class basisu_transcoder;
+
+	public:
+		basisu_lowlevel_uastc_hdr_4x4_transcoder();
+
+		// Transcodes a single slice, with the alpha flag and original dimensions passed explicitly.
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0);
+
+		// Convenience overload for .basis files: derives has_alpha from the header's cBASISHeaderFlagHasAlphaSlices flag and
+		// the original width/height from the slice descriptor, then forwards everything to the overload above.
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0)
+		{
+			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt,
+				output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels,
+				pState, output_rows_in_pixels, channel0, channel1, decode_flags);
+		}
+
+		// Container independent transcoding
+		// Takes the compressed data plus an explicit slice offset/length rather than .basis header structures.
+		bool transcode_image(
+			transcoder_texture_format target_format,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+			uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+			uint32_t slice_offset, uint32_t slice_length,
+			uint32_t decode_flags = 0,
+			bool has_alpha = false,
+			bool is_video = false,
+			uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr,
+			uint32_t output_rows_in_pixels = 0,
+			int channel0 = -1, int channel1 = -1);
 	};
 
-	class basisu_lowlevel_uastc_transcoder
+	// ASTC HDR 6x6
+	class basisu_lowlevel_astc_hdr_6x6_transcoder
 	{
 		friend class basisu_transcoder;
 
 	public:
-		basisu_lowlevel_uastc_transcoder();
+		basisu_lowlevel_astc_hdr_6x6_transcoder();
 
 		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
 			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
@@ -331,12 +426,13 @@ namespace basist
 			int channel0 = -1, int channel1 = -1);
 	};
 
-	class basisu_lowlevel_uastc_hdr_transcoder
+	// ASTC HDR 6x6 intermediate
+	class basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder
 	{
 		friend class basisu_transcoder;
 
 	public:
-		basisu_lowlevel_uastc_hdr_transcoder();
+		basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder();
 
 		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
 			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
@@ -379,6 +475,9 @@ namespace basist
 		uint32_t m_num_blocks_y;
 		uint32_t m_total_blocks;
 
+		uint32_t m_block_width;
+		uint32_t m_block_height;
+
 		uint32_t m_compressed_size;
 
 		uint32_t m_slice_index;	// the slice index in the .basis file
@@ -400,10 +499,13 @@ namespace basist
 
 		uint32_t m_orig_width;
 		uint32_t m_orig_height;
-
+				
 		uint32_t m_width;
 		uint32_t m_height;
 
+		uint32_t m_block_width;
+		uint32_t m_block_height;
+
 		uint32_t m_num_blocks_x;
 		uint32_t m_num_blocks_y;
 		uint32_t m_total_blocks;
@@ -425,6 +527,9 @@ namespace basist
 		uint32_t m_width;
 		uint32_t m_height;
 
+		uint32_t m_block_width;
+		uint32_t m_block_height;
+
 		uint32_t m_num_blocks_x;
 		uint32_t m_num_blocks_y;
 		uint32_t m_total_blocks;
@@ -474,6 +579,9 @@ namespace basist
 
 		basis_tex_format m_tex_format; // ETC1S, UASTC, etc.
 
+		uint32_t m_block_width;
+		uint32_t m_block_height;
+
 		bool m_y_flipped;				// true if the image was Y flipped
 		bool m_etc1s;					// true if the file is ETC1S
 		bool m_has_alpha_slices;	// true if the texture has alpha slices (for ETC1S: even slices RGB, odd slices alpha)
@@ -502,7 +610,7 @@ namespace basist
 		// Note that the number of mipmap levels for each image may differ, and that images may have different resolutions.
 		uint32_t get_total_images(const void* pData, uint32_t data_size) const;
 
-		basis_tex_format get_tex_format(const void* pData, uint32_t data_size) const;
+		basis_tex_format get_basis_tex_format(const void* pData, uint32_t data_size) const;
 
 		// Returns the number of mipmap levels in an image.
 		uint32_t get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const;
@@ -532,7 +640,7 @@ namespace basist
 		// It'll first find the slice(s) to transcode, then call transcode_slice() one or two times to decode both the color and alpha texture data (or RG texture data from two slices for BC5).
 		// If the .basis file doesn't have alpha slices, the output alpha blocks will be set to fully opaque (all 255's).
 		// Currently, to decode to PVRTC1 the basis texture's dimensions in pixels must be a power of 2, due to PVRTC1 format requirements. 
-		// output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32.
+		// output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32 etc.
 		// output_row_pitch_in_blocks_or_pixels: Number of blocks or pixels per row. If 0, the transcoder uses the slice's num_blocks_x or orig_width (NOT num_blocks_x * 4). Ignored for PVRTC1 (due to texture swizzling).
 		// output_rows_in_pixels: Ignored unless fmt is uncompressed (cRGBA32, etc.). The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4).
 		// Notes: 
@@ -574,13 +682,15 @@ namespace basist
 		const basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() const { return m_lowlevel_etc1s_decoder; }
 		basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() { return m_lowlevel_etc1s_decoder; }
 
-		const basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; }
-		basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; }
+		const basisu_lowlevel_uastc_ldr_4x4_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; }
+		basisu_lowlevel_uastc_ldr_4x4_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; }
 
 	private:
 		mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder;
-		mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder;
-		mutable basisu_lowlevel_uastc_hdr_transcoder m_lowlevel_uastc_hdr_decoder;
+		mutable basisu_lowlevel_uastc_ldr_4x4_transcoder m_lowlevel_uastc_decoder;
+		mutable basisu_lowlevel_uastc_hdr_4x4_transcoder m_lowlevel_uastc_4x4_hdr_decoder;
+		mutable basisu_lowlevel_astc_hdr_6x6_transcoder m_lowlevel_astc_6x6_hdr_decoder;
+		mutable basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder m_lowlevel_astc_6x6_hdr_intermediate_decoder;
 
 		bool m_ready_to_transcode;
 
@@ -654,6 +764,12 @@ namespace basist
 		basisu::packed_uint<4> m_alpha_slice_byte_length;
 	};
 
+	struct ktx2_astc_hdr_6x6_intermediate_image_desc
+	{
+		basisu::packed_uint<4> m_rgb_slice_byte_offset;
+		basisu::packed_uint<4> m_rgb_slice_byte_length;
+	};
+
 	struct ktx2_animdata
 	{
 		basisu::packed_uint<4> m_duration;
@@ -663,10 +779,22 @@ namespace basist
 #pragma pack(pop)
 
 	const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0;
-	const uint32_t KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK = 1000066000; // TODO, is this correct?
-	const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166;
-	const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR = 167;
-	const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163;
+	
+	// These are standard Vulkan texture VkFormat ID's, see https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFormat.html
+	const uint32_t KTX2_FORMAT_ASTC_4x4_SFLOAT_BLOCK = 1000066000;
+	const uint32_t KTX2_FORMAT_ASTC_5x4_SFLOAT_BLOCK = 1000066001;
+	const uint32_t KTX2_FORMAT_ASTC_5x5_SFLOAT_BLOCK = 1000066002;
+	const uint32_t KTX2_FORMAT_ASTC_6x5_SFLOAT_BLOCK = 1000066003;
+	const uint32_t KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK = 1000066004;
+	const uint32_t KTX2_FORMAT_ASTC_8x5_SFLOAT_BLOCK = 1000066005;
+	const uint32_t KTX2_FORMAT_ASTC_8x6_SFLOAT_BLOCK = 1000066006;
+
+	const uint32_t KTX2_KDF_DF_MODEL_ASTC = 162; // 0xA2
+	const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163; // 0xA3
+	const uint32_t KTX2_KDF_DF_MODEL_UASTC_LDR_4X4 = 166; // 0xA6
+	const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR_4X4 = 167; // 0xA7
+	const uint32_t KTX2_KDF_DF_MODEL_ASTC_HDR_6X6_INTERMEDIATE = 168; // 0xA8, TODO - coordinate with Khronos on this
+	
 	const uint32_t KTX2_IMAGE_IS_P_FRAME = 2;
 	const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; // also the block size for UASTC_HDR
 	const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased
@@ -679,7 +807,8 @@ namespace basist
 	{
 		KTX2_SS_NONE = 0,
 		KTX2_SS_BASISLZ = 1,
-		KTX2_SS_ZSTANDARD = 2
+		KTX2_SS_ZSTANDARD = 2,
+		KTX2_SS_BASIS
 	};
 
 	extern const uint8_t g_ktx2_file_identifier[12];
@@ -779,11 +908,15 @@ namespace basist
 		// The image's physical width/height, which will always be divisible by 4 pixels.
 		uint32_t m_width;
 		uint32_t m_height;
-
-		// The texture's dimensions in 4x4 texel blocks.
+				
+		// The texture's dimensions in 4x4 or 6x6 texel blocks.
 		uint32_t m_num_blocks_x;
 		uint32_t m_num_blocks_y;
 
+		// The format's block width/height (currently either 4 or 6).
+		uint32_t m_block_width;
+		uint32_t m_block_height;
+
 		// The total number of blocks
 		uint32_t m_total_blocks;
 
@@ -853,14 +986,38 @@ namespace basist
 		// Returns 0 or the number of layers in the texture array or texture video. Valid after init().
 		uint32_t get_layers() const { return m_header.m_layer_count; }
 
-		// Returns cETC1S, cUASTC4x4, or cUASTC_HDR_4x4. Valid after init().
-		basist::basis_tex_format get_format() const { return m_format; } 
-				
-		bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; }
+		// Returns cETC1S, cUASTC4x4, cUASTC_HDR_4x4, cASTC_HDR_6x6, or cASTC_HDR_6x6_INTERMEDIATE. Valid after init().
+		basist::basis_tex_format get_basis_tex_format() const { return m_format; }
+
+		// ETC1S LDR 4x4
+		bool is_etc1s() const { return get_basis_tex_format() == basist::basis_tex_format::cETC1S; }
 
-		bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; }
+		// UASTC LDR 4x4 (only)
+		bool is_uastc() const { return get_basis_tex_format() == basist::basis_tex_format::cUASTC4x4; }
+
+		// Is ASTC HDR 4x4 or 6x6
+		bool is_hdr() const // true when basis_tex_format_is_hdr() classifies this file's basis_tex_format as HDR
+		{
+			return basis_tex_format_is_hdr(get_basis_tex_format());
+		}
 
-		bool is_hdr() const { return get_format() == basist::basis_tex_format::cUASTC_HDR_4x4; }
+		bool is_ldr() const // LDR is defined here simply as "not HDR"
+		{
+			return !is_hdr();
+		}
+
+		bool is_hdr_4x4() const // true only for cUASTC_HDR_4x4
+		{
+			return (get_basis_tex_format() == basist::basis_tex_format::cUASTC_HDR_4x4);
+		}
+
+		bool is_hdr_6x6() const // true for either 6x6 variant: cASTC_HDR_6x6 or cASTC_HDR_6x6_INTERMEDIATE
+		{
+			return (get_basis_tex_format() == basist::basis_tex_format::cASTC_HDR_6x6) || (get_basis_tex_format() == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE);
+		}
+
+		uint32_t get_block_width() const { return basis_tex_format_get_block_width(get_basis_tex_format()); } // block width derived from this file's basis_tex_format
+		uint32_t get_block_height() const { return basis_tex_format_get_block_height(get_basis_tex_format());	} // block height derived from this file's basis_tex_format
 
 		// Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init().
 		uint32_t get_has_alpha() const { return m_has_alpha; }
@@ -893,10 +1050,12 @@ namespace basist
 		// Key value field data.
 		struct key_value
 		{
-			// The key field is UTF8 and always zero terminated.
+			// The key field is UTF8 and always zero terminated. 
+			// In memory we always append a zero terminator to the key.
 			basisu::uint8_vec m_key;
 
-			// The value may be empty. It consists of raw bytes which may or may not be zero terminated.
+			// The value may be empty. In the KTX2 file it consists of raw bytes which may or may not be zero terminated. 
+			// In memory we always append a zero terminator to the value.
 			basisu::uint8_vec m_value;
 
 			bool operator< (const key_value& rhs) const { return strcmp((const char*)m_key.data(), (const char *)rhs.m_key.data()) < 0; }
@@ -917,12 +1076,17 @@ namespace basist
 		// Returns the array of ETC1S image descriptors, which is only valid after get_etc1s_image_descs() is called.
 		const basisu::vector<ktx2_etc1s_image_desc>& get_etc1s_image_descs() const { return m_etc1s_image_descs; }
 
+		const basisu::vector<ktx2_astc_hdr_6x6_intermediate_image_desc>& get_astc_hdr_6x6_intermediate_image_descs() const { return m_astc_6x6_intermediate_image_descs; }
+
 		// start_transcoding() must have been called first.
 		uint32_t get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const;
 
 		// is_video() is only valid after start_transcoding() is called.
 		// For ETC1S data, if this returns true you must currently transcode the file from first to last frame, in order, without skipping any frames.
 		bool is_video() const { return m_is_video; }
+		
+		// Defaults to 0, only non-zero if the key existed in the source KTX2 file.
+		float get_ldr_hdr_upconversion_nit_multiplier() const { return m_ldr_hdr_upconversion_nit_multiplier; }
 				
 		// start_transcoding() MUST be called before calling transcode_image().
 		// This method decompresses the ETC1S global endpoint/selector codebooks, which is not free, so try to avoid calling it excessively.
@@ -956,6 +1120,7 @@ namespace basist
 		
 		ktx2_etc1s_global_data_header m_etc1s_header;
 		basisu::vector<ktx2_etc1s_image_desc> m_etc1s_image_descs;
+		basisu::vector<ktx2_astc_hdr_6x6_intermediate_image_desc> m_astc_6x6_intermediate_image_descs;
 
 		basist::basis_tex_format m_format;
 					
@@ -967,19 +1132,54 @@ namespace basist
 		ktx2_df_channel_id m_dfd_chan0, m_dfd_chan1;
 								
 		basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder;
-		basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder;
-		basist::basisu_lowlevel_uastc_hdr_transcoder m_uastc_hdr_transcoder;
+		basist::basisu_lowlevel_uastc_ldr_4x4_transcoder m_uastc_transcoder;
+		basist::basisu_lowlevel_uastc_hdr_4x4_transcoder m_uastc_hdr_transcoder;
+		basist::basisu_lowlevel_astc_hdr_6x6_transcoder m_astc_hdr_6x6_transcoder;
+		basist::basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder m_astc_hdr_6x6_intermediate_transcoder;
 				
 		ktx2_transcoder_state m_def_transcoder_state;
 
 		bool m_has_alpha;
 		bool m_is_video;
+		float m_ldr_hdr_upconversion_nit_multiplier;
 
 		bool decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data);
+		bool read_astc_6x6_hdr_intermediate_global_data();
 		bool decompress_etc1s_global_data();
 		bool read_key_values();
 	};
 
+	// Adds the (key, val) pair to key_values. Replaces if the key already exists (existing keys are matched with strcmp()).
+	inline void ktx2_add_key_value(ktx2_transcoder::key_value_vec& key_values, const std::string& key, const std::string& val)
+	{
+		assert(key.size()); // the key must not be empty
+
+		basist::ktx2_transcoder::key_value* p = nullptr; // entry to (over)write; stays null if the key is not found
+
+		// Try to find an existing key
+		for (size_t i = 0; i < key_values.size(); i++)
+		{
+			if (strcmp((const char*)key_values[i].m_key.data(), key.c_str()) == 0)
+			{
+				p = &key_values[i];
+				break;
+			}
+		}
+		
+		if (!p)
+			p = key_values.enlarge(1); // presumably appends one entry and returns a pointer to it - TODO confirm against basisu::vector
+
+		p->m_key.resize(0); // drop any previous contents before refilling
+		p->m_value.resize(0);
+
+		p->m_key.resize(key.size() + 1); // +1 for the zero terminator; assumes resize() zero-fills new bytes - TODO confirm
+		memcpy(p->m_key.data(), key.c_str(), key.size());
+
+		p->m_value.resize(val.size() + 1); // the value also gets one extra trailing byte
+		if (val.size()) // nothing to copy for an empty value
+			memcpy(p->m_value.data(), val.c_str(), val.size());
+	}
+
 #endif // BASISD_SUPPORT_KTX2
 
 	// Returns true if the transcoder was compiled with KTX2 support.

+ 65 - 3
thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h

@@ -20,9 +20,10 @@
 #pragma warning (disable: 4127) //  conditional expression is constant
 #endif
 
-// v1.50: Added UASTC HDR support
-#define BASISD_LIB_VERSION 150
-#define BASISD_VERSION_STRING "01.50"
+// v1.50: Added UASTC HDR 4x4 support
+// v1.60: Added RDO ASTC HDR 6x6 and intermediate support
+#define BASISD_LIB_VERSION 160
+#define BASISD_VERSION_STRING "01.60"
 
 #ifdef _DEBUG
 #define BASISD_BUILD_DEBUG
@@ -91,10 +92,37 @@ namespace basist
 		cUASTC_HDR_4x4,						// HDR, transcodes only to 4x4 HDR ASTC, BC6H, or uncompressed
 		cBC6H,
 		cASTC_HDR_4x4,
+		cASTC_HDR_6x6,
 								
 		cTotalBlockFormats
 	};
 
+	inline uint32_t get_block_width(block_format fmt) // Returns the block width for the given block_format.
+	{
+		switch (fmt)
+		{
+		case block_format::cFXT1_RGB:
+			return 8; // FXT1 is the only format here with an 8-wide block
+		case block_format::cASTC_HDR_6x6:
+			return 6;
+		default:
+			break;
+		}
+		return 4; // every other block_format is 4 wide
+	}
+
+	inline uint32_t get_block_height(block_format fmt) // Returns the block height for the given block_format.
+	{
+		switch (fmt)
+		{
+		case block_format::cASTC_HDR_6x6:
+			return 6;
+		default:
+			break;
+		}
+		return 4; // every other block_format (including cFXT1_RGB, which is only special-cased for width) is 4 high
+	}
+
 	const int COLOR5_PAL0_PREV_HI = 9, COLOR5_PAL0_DELTA_LO = -9, COLOR5_PAL0_DELTA_HI = 31;
 	const int COLOR5_PAL1_PREV_HI = 21, COLOR5_PAL1_DELTA_LO = -21, COLOR5_PAL1_DELTA_HI = 21;
 	const int COLOR5_PAL2_PREV_HI = 31, COLOR5_PAL2_DELTA_LO = -31, COLOR5_PAL2_DELTA_HI = 9;
@@ -559,6 +587,12 @@ namespace basist
 			return ct.init(total_used_syms, &code_sizes[0]);
 		}
 
+		size_t get_bits_remaining() const // Returns 8 bits per byte between m_pBuf and m_pBuf_end, plus m_bit_buf_size.
+		{
+			size_t total_bytes_remaining = m_pBuf_end - m_pBuf; // bytes between the current read pointer and the end of the buffer
+			return total_bytes_remaining * 8 + m_bit_buf_size; // presumably m_bit_buf_size counts bits already fetched into the bit buffer but not yet consumed - TODO confirm
+		}
+
 	private:
 		uint32_t m_buf_size;
 		const uint8_t *m_pBuf;
@@ -804,6 +838,7 @@ namespace basist
 	const double MIN_DENORM_HALF_FLOAT = 0.000000059604645; // smallest positive subnormal number
 	const double MIN_HALF_FLOAT = 0.00006103515625; // smallest positive normal number
 	const double MAX_HALF_FLOAT = 65504.0; // largest normal number
+	const uint32_t MAX_HALF_FLOAT_AS_INT_BITS = 0x7BFF; // the half float rep for 65504.0
 
 	inline uint32_t get_bits(uint32_t val, int low, int high)
 	{
@@ -975,6 +1010,13 @@ namespace basist
 		return (h * 64 + 30) / 31;
 	}
 
+	// Scales the half float bit pattern h by 64/31 (with a +30 bias before dividing) and then by 1/2^(16 - num_bits). Suboptimal, but very close.
+	inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits)
+	{
+		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); // h must not exceed MAX_BC6H_HALF_FLOAT_AS_UINT
+		return (h * 64 + 30) / (31 * (1 << (16 - num_bits))); // NOTE(review): requires num_bits <= 16, otherwise the shift count is negative; num_bits == 16 reduces to (h * 64 + 30) / 31
+	}
+
 	struct bc6h_block
 	{
 		uint8_t m_bytes[16];
@@ -987,6 +1029,26 @@ namespace basist
 	void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
 	void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
 	bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]);
+
+	struct bc6h_logical_block // Field-by-field description of a BC6H block; pack_bc6h_block() takes one of these alongside the destination bc6h_block.
+	{
+		uint32_t m_mode; // NOTE(review): mode numbering convention is not visible here - confirm against pack_bc6h_block()
+		uint32_t m_partition_pattern;	// must be 0 if 1 subset
+		uint32_t m_endpoints[3][4];		// [comp][subset*2+lh_index] - must be already properly packed
+		uint8_t m_weights[16];			// weights must be of the proper size, taking into account skipped MSB's which must be 0
+
+		void clear() // resets every field via basisu::clear_obj()
+		{
+			basisu::clear_obj(*this); // presumably zero-fills the whole object - TODO confirm
+		}
+	};
+
+	void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk);
+		
+	namespace bc7_mode_5_encoder
+	{
+		void encode_bc7_mode_5_block(void* pDst_block, color32* pPixels, bool hq_mode);
+	}
 		
 } // namespace basist
 

+ 1 - 1
thirdparty/libktx/lib/basis_transcode.cpp

@@ -658,7 +658,7 @@ ktxTexture2_transcodeUastc(ktxTexture2* This,
     ktxLevelIndexEntry* protoLevelIndex = protoPriv._levelIndex;
     ktx_size_t levelOffsetWrite = 0;
 
-    basisu_lowlevel_uastc_transcoder uit;
+    basisu_lowlevel_uastc_ldr_4x4_transcoder uit;
     // See comment on same declaration in transcodeEtc1s.
     std::vector<basisu_transcoder_state> xcoderStates;
     xcoderStates.resize(This->isVideo ? This->numFaces : 1);

+ 13 - 0
thirdparty/libktx/patches/0003-basisu-1.60.patch

@@ -0,0 +1,13 @@
+diff --git a/thirdparty/libktx/lib/basis_transcode.cpp b/thirdparty/libktx/lib/basis_transcode.cpp
+index d7ecb7a0fd..43ad059150 100644
+--- a/thirdparty/libktx/lib/basis_transcode.cpp
++++ b/thirdparty/libktx/lib/basis_transcode.cpp
+@@ -658,7 +658,7 @@ ktxTexture2_transcodeUastc(ktxTexture2* This,
+     ktxLevelIndexEntry* protoLevelIndex = protoPriv._levelIndex;
+     ktx_size_t levelOffsetWrite = 0;
+ 
+-    basisu_lowlevel_uastc_transcoder uit;
++    basisu_lowlevel_uastc_ldr_4x4_transcoder uit;
+     // See comment on same declaration in transcodeEtc1s.
+     std::vector<basisu_transcoder_state> xcoderStates;
+     xcoderStates.resize(This->isVideo ? This->numFaces : 1);

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov