
Updated meshoptimizer.

Бранимир Караџић 3 months ago
parent
commit
c24efd27c9

+ 11 - 2
3rdparty/meshoptimizer/src/allocator.cpp

@@ -1,8 +1,17 @@
 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+meshopt_Allocator::Storage& meshopt_Allocator::storage()
+{
+	static Storage s = {::operator new, ::operator delete};
+	return s;
+}
+#endif
+
 void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
 {
-	meshopt_Allocator::Storage::allocate = allocate;
-	meshopt_Allocator::Storage::deallocate = deallocate;
+	meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
+	s.allocate = allocate;
+	s.deallocate = deallocate;
 }
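
For reference, a usage sketch (not part of this commit) of the allocator override that this hunk refactors into a function-local static; myAlloc/myFree are hypothetical stand-ins for an application allocator:

    #include "meshoptimizer.h"
    #include <cstdlib>

    // hypothetical wrappers; any pair matching these signatures works
    static void* myAlloc(size_t size) { return malloc(size); }
    static void myFree(void* ptr) { free(ptr); }

    int main()
    {
        // all subsequent library allocations go through the custom pair
        meshopt_setAllocator(myAlloc, myFree);
    }

The refactor replaces static data members with a function-local static, which guarantees the storage is initialized on first use.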

+ 171 - 95
3rdparty/meshoptimizer/src/clusterizer.cpp

@@ -6,11 +6,23 @@
 #include <math.h>
 #include <string.h>
 
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
+#define SIMD_SSE
+#include <emmintrin.h>
+#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
+#define SIMD_NEON
+#include <arm_neon.h>
+#endif
+#endif // !MESHOPTIMIZER_NO_SIMD
+
 // This work is based on:
 // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
 // Jack Ritter. An Efficient Bounding Sphere. 1990
 // Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
+// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
 namespace meshopt
 {
 
@@ -148,6 +160,19 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un
 	}
 }
 
+static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
+{
+	// for sparse inputs, it's faster to only clear vertices referenced by the index buffer
+	if (vertex_count <= index_count)
+		memset(used, -1, vertex_count * sizeof(short));
+	else
+		for (size_t i = 0; i < index_count; ++i)
+		{
+			assert(indices[i] < vertex_count);
+			used[indices[i]] = -1;
+		}
+}
+
 static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
 {
 	static const float kAxes[7][3] = {
@@ -264,21 +289,8 @@ struct Cone
 	float nx, ny, nz;
 };
 
-static float getDistance(float dx, float dy, float dz, bool aa)
-{
-	if (!aa)
-		return sqrtf(dx * dx + dy * dy + dz * dz);
-
-	float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);
-	float rxy = rx > ry ? rx : ry;
-	return rxy > rz ? rxy : rz;
-}
-
 static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
 {
-	if (cone_weight < 0)
-		return 1 + distance / expected_radius;
-
 	float cone = 1.f - spread * cone_weight;
 	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
 
@@ -347,15 +359,6 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices,
 	return mesh_area;
 }
 
-static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
-{
-	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
-
-	// fill 4b padding with 0
-	while (offset & 3)
-		meshlet_triangles[offset++] = 0;
-}
-
 static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
 {
 	short& av = used[a];
@@ -373,10 +376,8 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
 		for (size_t j = 0; j < meshlet.vertex_count; ++j)
 			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
 
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlet.vertex_offset += meshlet.vertex_count;
-		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.triangle_offset += meshlet.triangle_count * 3;
 		meshlet.vertex_count = 0;
 		meshlet.triangle_count = 0;
 
@@ -452,7 +453,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			const Cone& tri_cone = triangles[triangle];
 
 			float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
-			float distance = getDistance(dx, dy, dz, cone_weight < 0);
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
 			float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
 
 			float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
@@ -513,7 +514,8 @@ static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& me
 		if (best_neighbor == ~0u)
 			continue;
 
-		float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);
+		float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
+		float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		for (size_t j = 0; j < kMeshletAddSeeds; ++j)
 		{
@@ -565,7 +567,8 @@ static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_co
 		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
 
 		unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
-		float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);
+		float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		if (live < best_live || (live == best_live && score < best_score))
 		{
@@ -586,7 +589,7 @@ struct KDNode
 		unsigned int index;
 	};
 
-	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// leaves: axis = 3, children = number of points including this one
 	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
 	unsigned int axis : 2;
 	unsigned int children : 30;
@@ -622,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 
 	result.index = indices[0];
 	result.axis = 3;
-	result.children = unsigned(count - 1);
+	result.children = unsigned(count);
 
 	// all remaining points are stored in nodes immediately following the leaf
 	for (size_t i = 1; i < count; ++i)
@@ -681,29 +684,37 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
 
 	// distance to the right subtree is represented explicitly
+	assert(next_offset - offset > 1);
 	result.children = unsigned(next_offset - offset - 1);
 
 	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
 }
 
-static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
 {
 	const KDNode& node = nodes[root];
 
+	if (node.children == 0)
+		return;
+
 	if (node.axis == 3)
 	{
 		// leaf
-		for (unsigned int i = 0; i <= node.children; ++i)
+		bool inactive = true;
+
+		for (unsigned int i = 0; i < node.children; ++i)
 		{
 			unsigned int index = nodes[root + i].index;
 
 			if (emitted_flags[index])
 				continue;
 
+			inactive = false;
+
 			const float* point = points + index * stride;
 
 			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
-			float distance = getDistance(dx, dy, dz, aa);
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
 
 			if (distance < limit)
 			{
@@ -711,6 +722,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 				limit = distance;
 			}
 		}
+
+		// deactivate leaves that no longer have items to emit
+		if (inactive)
+			nodes[root].children = 0;
 	}
 	else
 	{
@@ -719,34 +734,84 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		unsigned int first = (delta <= 0) ? 0 : node.children;
 		unsigned int second = first ^ node.children;
 
-		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);
+		// deactivate branches that no longer have items to emit to accelerate traversal
+		// note that we do this *before* recursing which delays deactivation but keeps tail calls
+		if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
+			nodes[root].children = 0;
+
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
 
 		// only process the other node if it can have a match based on closest distance so far
 		if (fabsf(delta) <= limit)
-			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
 	}
 }
 
+struct BVHBoxT
+{
+	float min[4];
+	float max[4];
+};
+
 struct BVHBox
 {
 	float min[3];
 	float max[3];
 };
 
-static void boxMerge(BVHBox& box, const BVHBox& other)
+#if defined(SIMD_SSE)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	__m128 min = _mm_loadu_ps(box.min);
+	__m128 max = _mm_loadu_ps(box.max);
+
+	min = _mm_min_ps(min, _mm_loadu_ps(other.min));
+	max = _mm_max_ps(max, _mm_loadu_ps(other.max));
+
+	_mm_store_ps(box.min, min);
+	_mm_store_ps(box.max, max);
+
+	__m128 size = _mm_sub_ps(max, min);
+	__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
+	__m128 mul = _mm_mul_ps(size, size_yzx);
+	__m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
+	__m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
+
+	return _mm_cvtss_f32(sum_xyz);
+}
+#elif defined(SIMD_NEON)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	float32x4_t min = vld1q_f32(box.min);
+	float32x4_t max = vld1q_f32(box.max);
+
+	min = vminq_f32(min, vld1q_f32(other.min));
+	max = vmaxq_f32(max, vld1q_f32(other.max));
+
+	vst1q_f32(box.min, min);
+	vst1q_f32(box.max, max);
+
+	float32x4_t size = vsubq_f32(max, min);
+	float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
+	float32x4_t mul = vmulq_f32(size, size_yzx);
+	float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
+	float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
+
+	return sum_xyz;
+}
+#else
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
 {
 	for (int k = 0; k < 3; ++k)
 	{
 		box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
 		box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
 	}
-}
 
-inline float boxSurface(const BVHBox& box)
-{
 	float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
 	return sx * sy + sx * sz + sy * sz;
 }
+#endif
 
 inline unsigned int radixFloat(unsigned int v)
 {
@@ -832,7 +897,7 @@ static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indi
 	}
 }
 
-static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
 {
 	// count number of unique vertices
 	size_t used_vertices = 0;
@@ -843,6 +908,9 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size
 
 		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
 		used[a] = used[b] = used[c] = 1;
+
+		if (out)
+			out[i] = unsigned(used_vertices);
 	}
 
 	// reset used[] for future invocations
@@ -854,16 +922,16 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size
 		used[a] = used[b] = used[c] = -1;
 	}
 
-	if (used_vertices > max_vertices)
-		return false;
+	return used_vertices;
+}
 
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
 	// mark meshlet boundary for future reassembly
 	assert(count > 0);
 
 	boundary[0] = 1;
 	memset(boundary + 1, 0, count - 1);
-
-	return true;
 }
 
 static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
@@ -872,8 +940,9 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size
 	{
 		size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
 
-		if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
+		if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
 		{
+			bvhPackLeaf(boundary + i, chunk);
 			i += chunk;
 			continue;
 		}
@@ -881,7 +950,7 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size
 		// chunk is vertex bound, split it into smaller meshlets
 		assert(chunk > max_vertices / 3);
 
-		bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
+		bvhPackLeaf(boundary + i, max_vertices / 3);
 		i += max_vertices / 3;
 	}
 }
@@ -889,30 +958,32 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size
 static bool bvhDivisible(size_t count, size_t min, size_t max)
 {
 	// count is representable as a sum of values in [min..max] if it is in range of [k*min..k*min+k*(max-min)]
-	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv
+	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
 	// we avoid expensive integer divisions in the common case where min is <= max/2
 	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
 }
 
-static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)
+static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
 {
-	BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];
-	float* costs = static_cast<float*>(scratch);
+	BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
+	BVHBoxT accumr = accuml;
 
-	// accumulate SAH cost in forward and backward directions
 	for (size_t i = 0; i < count; ++i)
 	{
-		boxMerge(accuml, boxes[order[i]]);
-		boxMerge(accumr, boxes[order[count - 1 - i]]);
+		float larea = boxMerge(accuml, boxes[order[i]]);
+		float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
 
-		costs[i] = boxSurface(accuml);
-		costs[i + count] = boxSurface(accumr);
+		areas[i] = larea;
+		areas[i + count] = rarea;
 	}
+}
 
+static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
+{
 	bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
 	size_t end = aligned ? count - min : count - 1;
 
-	float rmaxf = 1.f / float(int(max));
+	float rmaxfill = 1.f / float(int(maxfill));
 
 	// find best split that minimizes SAH
 	size_t bestsplit = 0;
@@ -927,17 +998,22 @@ static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t co
 		if (aligned && !bvhDivisible(rsplit, min, max))
 			continue;
 
-		// costs[x] = inclusive surface area of boxes[0..x]
-		// costs[count-1-x] = inclusive surface area of boxes[x..count-1]
-		float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];
+		// areas[x] = inclusive surface area of boxes[0..x]
+		// areas[count-1-x] = inclusive surface area of boxes[x..count-1]
+		float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
 		float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
 
 		if (cost > bestcost)
 			continue;
 
-		// fill cost; use floating point math to avoid expensive integer modulo
-		int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);
-		int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);
+		// use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
+		// using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
+		size_t lfill = vertices ? vertices[i] : lsplit;
+		size_t rfill = vertices ? vertices[i] : rsplit;
+
+		// fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
+		int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
+		int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
 
 		cost += fill * (float(lrest) * larea + float(rrest) * rarea);
 
@@ -973,8 +1049,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 	if (depth >= kMeshletMaxTreeDepth)
 		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
 
-	if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))
-		return;
+	if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
+		return bvhPackLeaf(boundary, count);
 
 	unsigned int* axes[3] = {orderx, ordery, orderz};
 
@@ -983,9 +1059,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 
 	// if we could not pack the meshlet, we must be vertex bound
 	size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
-
-	// only use fill weight if we are optimizing for triangle count
-	float fill = count <= max_triangles ? 0.f : fill_weight;
+	size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
 
 	// find best split that minimizes SAH
 	int bestk = -1;
@@ -994,8 +1068,20 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 
 	for (int k = 0; k < 3; ++k)
 	{
+		float* areas = static_cast<float*>(scratch);
+		unsigned int* vertices = NULL;
+
+		bvhComputeArea(areas, boxes, axes[k], count);
+
+		if (count <= max_triangles)
+		{
+			// for vertex bound clusters, count number of unique vertices for each split
+			vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
+			bvhCountVertices(axes[k], count, used, indices, vertices);
+		}
+
 		float axiscost = FLT_MAX;
-		size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);
+		size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
 
 		if (axissplit && axiscost < bestcost)
 		{
@@ -1044,7 +1130,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	assert(index_count % 3 == 0);
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	(void)kMeshletMaxVertices;
 	(void)kMeshletMaxTriangles;
@@ -1069,9 +1154,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
-	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
-	assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets
+	assert(cone_weight >= 0 && cone_weight <= 1);
 	assert(split_factor >= 0);
 
 	if (index_count == 0)
@@ -1123,7 +1207,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	// initial seed triangle is the one closest to the corner
 	unsigned int initial_seed = ~0u;
@@ -1133,7 +1217,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 	{
 		const Cone& tri = triangles[i];
 
-		float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);
+		float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		if (initial_seed == ~0u || score < initial_score)
 		{
@@ -1173,7 +1258,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 			unsigned int index = ~0u;
 			float distance = FLT_MAX;
 
-			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance);
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
 
 			best_triangle = index;
 			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
@@ -1243,20 +1328,15 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
 size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
-	assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex
-
 	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
 }
 
@@ -1268,13 +1348,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	meshopt_Allocator allocator;
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -1289,17 +1368,14 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
-size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
 {
 	using namespace meshopt;
 
@@ -1309,7 +1385,6 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
-	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	if (index_count == 0)
 		return 0;
@@ -1320,13 +1395,14 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int
 	meshopt_Allocator allocator;
 
 	// 3 floats plus 1 uint for sorting, or
-	// 2 floats for SAH costs, or
+	// 2 floats plus 1 uint for pivoting, or
 	// 1 uint plus 1 byte for partitioning
 	float* scratch = allocator.allocate<float>(face_count * 4);
 
 	// compute bounding boxes and centroids for sorting
-	BVHBox* boxes = allocator.allocate<BVHBox>(face_count);
+	BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
 	bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+	memset(boxes + face_count, 0, sizeof(BVHBox));
 
 	unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
 	unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
@@ -1350,7 +1426,7 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
 
@@ -1391,13 +1467,10 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshlet_bound);
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
@@ -1693,3 +1766,6 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
 	assert(vertex_offset <= vertex_count);
 	memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
 }
+
+#undef SIMD_SSE
+#undef SIMD_NEON
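
A consequence of dropping finishMeshlet above: meshlet_triangles is now tightly packed, with triangle_offset advancing by triangle_count * 3 and no 4-byte padding between meshlets. A hedged read-back sketch (meshlets, meshlet_vertices, meshlet_triangles and meshlet_count assumed to come from a meshopt_buildMeshlets call):

    for (size_t m = 0; m < meshlet_count; ++m)
    {
        const meshopt_Meshlet& ml = meshlets[m];

        for (size_t t = 0; t < ml.triangle_count * 3; ++t)
        {
            // 8-bit local index into this meshlet's vertex list...
            unsigned char local = meshlet_triangles[ml.triangle_offset + t];
            // ...resolved to an index into the original vertex buffer
            unsigned int global = meshlet_vertices[ml.vertex_offset + local];
            (void)global; // consume a/b/c triples here
        }
    }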

+ 25 - 0
3rdparty/meshoptimizer/src/indexgenerator.cpp

@@ -439,6 +439,31 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
 }
 
+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
+
+		if (*entry == ~0u)
+			*entry = unsigned(i);
+
+		destination[i] = *entry;
+	}
+}
+
 void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
 	using namespace meshopt;
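
A usage sketch for the new function (not part of this diff); Vertex, vertices, indices and position_indices are hypothetical, with a float3 position in the first 12 bytes of each vertex:

    std::vector<unsigned int> remap(vertex_count);
    meshopt_generatePositionRemap(remap.data(), &vertices[0].px, vertex_count, sizeof(Vertex));

    // remap[i] == remap[j] iff vertices i and j share a position, so mapping
    // indices through remap[] yields position-only connectivity
    for (size_t i = 0; i < index_count; ++i)
        position_indices[i] = remap[indices[i]];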

+ 180 - 66
3rdparty/meshoptimizer/src/meshoptimizer.h

@@ -1,5 +1,5 @@
 /**
- * meshoptimizer - version 0.23
+ * meshoptimizer - version 0.25
  *
  * Copyright (C) 2016-2025, by Arseny Kapoulkine (mail@zeux.io)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 230 /* 0.23 */
+#define MESHOPTIMIZER_VERSION 250 /* 0.25 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -75,7 +75,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
 MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
 /**
- * Experimental: Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
  * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
  * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
  * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
@@ -85,7 +85,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destinat
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
 
 /**
  * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
@@ -124,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
  */
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
+/**
+ * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
 /**
  * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
  * Each triangle is converted into a 6-vertex patch with the following layout:
@@ -154,8 +164,8 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
- * Experimental: Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
- * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using nointerpolate attribute.
+ * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
  * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
  * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
  * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
@@ -164,7 +174,7 @@ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* des
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
 
 /**
  * Vertex transform cache optimizer
@@ -241,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
 
 /**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
  * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
  */
 MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
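
A minimal sketch of the encoder workflow this note documents (indices, index_count and vertex_count are assumptions):

    meshopt_encodeIndexVersion(1); // optional: 1 is already the default per the note above

    std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(index_count, vertex_count));
    size_t isize = meshopt_encodeIndexBuffer(ibuf.data(), ibuf.size(), indices, index_count);
    ibuf.resize(isize); // encoded stream, decodable by 0.14+ decoders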
@@ -293,23 +304,27 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
  * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
  *
  * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
  */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
 
 /**
- * Experimental: Vertex buffer encoder
+ * Vertex buffer encoder
  * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
- * For compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion.
+ * For compression level to take effect, the vertex encoding version must be set to 1.
  * The default compression level implied by meshopt_encodeVertexBuffer is 2.
  *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
  * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
  * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
 
 /**
- * Set vertex encoder format version
+ * Set vertex encoder format version (defaults to 1)
+ *
  * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
  */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
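
A sketch of level-controlled encoding (Vertex is an assumed POD whose size is a multiple of 4 and <= 256):

    std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
    size_t vsize = meshopt_encodeVertexBufferLevel(vbuf.data(), vbuf.size(), vertices, vertex_count,
        sizeof(Vertex), /* level= */ 3, /* version= */ -1); // -1 keeps the globally configured version
    vbuf.resize(vsize);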
@@ -321,6 +336,7 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
  * The decoder is safe to use for untrusted input, but it may produce garbage data.
  *
  * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
  */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
 
@@ -343,10 +359,14 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
  *
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ *
+ * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
  */
 MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
 
 /**
  * Vertex buffer filter encoders
@@ -363,6 +383,10 @@ MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_
  * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
  * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
  * Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ *
+ * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
+ * Input data must contain 4 floats for every color (count*4 total).
  */
 enum meshopt_EncodeExpMode
 {
@@ -379,6 +403,7 @@ enum meshopt_EncodeExpMode
 MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
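
For the stable filters, a round-trip sketch using the octahedral filter (normals is assumed to hold count*4 floats, unit xyz plus padding):

    std::vector<unsigned char> packed(count * 4); // stride 4 selects the 8-bit variant
    meshopt_encodeFilterOct(packed.data(), count, /* stride= */ 4, /* bits= */ 8, normals);

    // after decoding the vertex stream, reverse the filter in place
    meshopt_decodeFilterOct(packed.data(), count, /* stride= */ 4);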
 
 /**
  * Simplification options
@@ -391,18 +416,34 @@ enum
 	meshopt_SimplifySparse = 1 << 1,
 	/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
 	meshopt_SimplifyErrorAbsolute = 1 << 2,
-	/* Experimental: remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+	/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
 	meshopt_SimplifyPrune = 1 << 3,
+	/* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
+	meshopt_SimplifyRegularize = 1 << 4,
+	/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
+	meshopt_SimplifyPermissive = 1 << 5,
+};
+
+/**
+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
+ */
+enum
+{
+	/* Do not move this vertex. */
+	meshopt_SimplifyVertex_Lock = 1 << 0,
+	/* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
+	meshopt_SimplifyVertex_Protect = 1 << 1,
 };
 
 /**
  * Mesh simplifier
  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh without them prior to simplification.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
  * Returns the number of indices after simplification, with destination containing new index data
+ *
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -414,50 +455,86 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 
 /**
  * Mesh simplifier with attribute metric
- * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ *
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
  *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * vertex_attributes should have attribute_count floats for each vertex
  * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
  * attribute_count must be <= 32
  * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
 MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
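
A hedged call sketch combining the new lock flags with attribute-aware simplification; Vertex (with px/nx fields), seam_vertex and the target values are assumptions:

    float attribute_weights[3] = {0.5f, 0.5f, 0.5f}; // one weight per normal component

    std::vector<unsigned char> locks(vertex_count, 0);
    locks[seam_vertex] = meshopt_SimplifyVertex_Lock; // pin a vertex in place

    float error = 0.f;
    size_t result_count = meshopt_simplifyWithAttributes(
        destination, indices, index_count,
        &vertices[0].px, vertex_count, sizeof(Vertex),
        &vertices[0].nx, sizeof(Vertex), attribute_weights, 3,
        locks.data(), index_count / 4, /* target_error= */ 0.01f,
        /* options= */ meshopt_SimplifyPrune, &error);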
 
+/**
+ * Experimental: Mesh simplifier with position/attribute update
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, indices are destructively updated with new index data
+ *
+ * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
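
A minimal in-place sketch under the assumptions above (positions only; passing zero attributes is assumed to be valid, as in meshopt_simplifyWithAttributes):

    float error = 0.f;
    size_t new_index_count = meshopt_simplifyWithUpdate(
        indices, index_count,
        positions, vertex_count, sizeof(float) * 3,
        NULL, 0, NULL, 0, // no attribute streams/weights
        NULL, index_count / 2, /* target_error= */ 0.01f, /* options= */ 0, &error);
    // indices[0..new_index_count) now reference updated positions in place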
+
 /**
  * Experimental: Mesh simplifier (sloppy)
  * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
  * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
  * Returns the number of indices after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
 
 /**
- * Experimental: Mesh simplifier (pruner)
+ * Mesh simplifier (pruner)
  * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
  * Returns the number of indices after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
 
 /**
  * Point cloud simplifier
  * Reduces the number of points in the cloud to reach the given target
  * Returns the number of points after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer (target_vertex_count elements)
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -548,12 +625,12 @@ struct meshopt_CoverageStatistics
 };
 
 /**
- * Experimental: Coverage analyzer
+ * Coverage analyzer
  * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
  *
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
  * Meshlet is a small mesh cluster (subset) that consists of:
@@ -582,10 +659,10 @@ struct meshopt_Meshlet
  * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
  *
  * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
  * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
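
A sizing sketch that follows the worst-case bounds described above (64/124 are conventional limits, not requirements; the divisible-by-4 restriction is gone in this version):

    const size_t max_vertices = 64, max_triangles = 124;
    size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);

    std::vector<meshopt_Meshlet> meshlets(max_meshlets);
    std::vector<unsigned int> meshlet_vertices(index_count);   // worst case: index_count
    std::vector<unsigned char> meshlet_triangles(index_count); // worst case: index_count

    size_t meshlet_count = meshopt_buildMeshlets(
        meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices, index_count, &vertices[0].px, vertex_count, sizeof(Vertex),
        max_vertices, max_triangles, /* cone_weight= */ 0.25f);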
@@ -596,14 +673,13 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m
  * Experimental: Meshlet builder with flexible cluster sizes
  * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
  * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
- * Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value.
  *
- * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
- * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
  */
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
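
For the flexible builder, the meshlets array must be sized using min_triangles; a hedged continuation of the sketch above (a min_triangles of 32 and a split_factor of 2 are illustrative choices, not library defaults):

    // worst case meshlet count follows from min_triangles (not max!)
    meshlets.resize(meshopt_buildMeshletsBound(indices.size(), 64, /* min_triangles= */ 32));

    size_t meshlet_count = meshopt_buildMeshletsFlex(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex),
        /* max_vertices= */ 64, /* min_triangles= */ 32, /* max_triangles= */ 124,
        /* cone_weight= */ 0.f, /* split_factor= */ 2.f);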
@@ -612,14 +688,14 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshl
  * Experimental: Meshlet builder that produces clusters optimized for raytracing
  * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows specifying the minimum and maximum number of triangles per meshlet.
  *
- * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
  * fill_weight allows prioritizing clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
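
The spatial builder uses the same buffer sizing rules as meshopt_buildMeshletsFlex; a minimal sketch with the safe default fill_weight from the comment above, reusing the buffers from the previous sketches:

    size_t meshlet_count = meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex),
        /* max_vertices= */ 64, /* min_triangles= */ 32, /* max_triangles= */ 124, /* fill_weight= */ 0.5f);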
 
 /**
  * Meshlet optimizer
@@ -674,26 +750,26 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsig
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
- * Experimental: Sphere bounds generator
+ * Sphere bounds generator
  * Creates a bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
  *
  * positions should have float3 position in the first 12 bytes of each element
  * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
  */
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
 
 /**
- * Experimental: Cluster partitioner
+ * Cluster partitioner
  * Partitions clusters into groups of similar size, preferring to group clusters that share vertices or are close to each other.
  *
- * destination must contain enough space for the resulting partiotion data (cluster_count elements)
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
  * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
  * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
  * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
  * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used)
  * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
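
A hedged usage sketch: grouping clusters (for example, per-meshlet vertex lists stored back to back) into partitions of roughly 8 clusters each; the container names are illustrative, and vertex_positions may be NULL per the comment above:

    std::vector<unsigned int> partition(cluster_index_counts.size());

    size_t partition_count = meshopt_partitionClusters(partition.data(),
        cluster_indices.data(), cluster_indices.size(),
        cluster_index_counts.data(), cluster_index_counts.size(),
        &vertices[0].x, vertices.size(), sizeof(Vertex),
        /* target_partition_size= */ 8);

    // partition[i] now holds the partition id of cluster i, in [0, partition_count)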
 
 /**
  * Spatial sorter
@@ -706,13 +782,23 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destin
 MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
- * Experimental: Spatial sorter
+ * Spatial sorter
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Spatial clusterizer
+ * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
+ * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk may be smaller than cluster_size.
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
 
 /**
  * Quantize a float into a half-precision (as defined by IEEE-754 fp16) floating-point value
@@ -813,13 +899,18 @@ template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
@@ -838,7 +929,7 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 template <typename T>
 inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
 template <typename T>
-inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
@@ -877,14 +968,21 @@ inline int meshopt_quantizeSnorm(float v, int N)
 class meshopt_Allocator
 {
 public:
-	template <typename T>
-	struct StorageT
+	struct Storage
 	{
-		static void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
-		static void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
+		void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+		void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
 	};
 
-	typedef StorageT<void> Storage;
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+	MESHOPTIMIZER_API static Storage& storage();
+#else
+	static Storage& storage()
+	{
+		static Storage s = {::operator new, ::operator delete };
+		return s;
+	}
+#endif
 
 	meshopt_Allocator()
 	    : blocks()
@@ -895,14 +993,14 @@ public:
 	~meshopt_Allocator()
 	{
 		for (size_t i = count; i > 0; --i)
-			Storage::deallocate(blocks[i - 1]);
+			storage().deallocate(blocks[i - 1]);
 	}
 
 	template <typename T>
 	T* allocate(size_t size)
 	{
 		assert(count < sizeof(blocks) / sizeof(blocks[0]));
-		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+		T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
 		blocks[count++] = result;
 		return result;
 	}
@@ -910,7 +1008,7 @@ public:
 	void deallocate(void* ptr)
 	{
 		assert(count > 0 && blocks[count - 1] == ptr);
-		Storage::deallocate(ptr);
+		storage().deallocate(ptr);
 		count--;
 	}
 
@@ -918,12 +1016,6 @@ private:
 	void* blocks[24];
 	size_t count;
 };
-
-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T>
-void* (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T>
-void (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif
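
With the Storage refactor above, the allocator pair is replaced globally via meshopt_setAllocator; a minimal sketch with malloc/free-backed hooks (the wrapper functions are illustrative). When the library is used as a shared object, defining MESHOPTIMIZER_ALLOC_EXPORT keeps a single Storage instance across module boundaries instead of one per translation unit.

    #include <cstdlib>
    #include "meshoptimizer.h"

    static void* customAlloc(size_t size) { return malloc(size); }
    static void customFree(void* ptr) { free(ptr); }

    int main()
    {
        meshopt_setAllocator(customAlloc, customFree);
        // all subsequent meshoptimizer calls allocate through customAlloc/customFree
    }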
 
 /* Inline implementation for C++ templated wrappers */
@@ -945,7 +1037,7 @@ struct meshopt_IndexAdapter<T, false>
 	{
 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
 
-		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+		data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));
 
 		if (input)
 		{
@@ -962,7 +1054,7 @@ struct meshopt_IndexAdapter<T, false>
 				result[i] = T(data[i]);
 		}
 
-		meshopt_Allocator::Storage::deallocate(data);
+		meshopt_Allocator::storage().deallocate(data);
 	}
 };
 
@@ -1161,6 +1253,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
 }
 
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1);
+}
+
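
A hedged sketch of the new level-based encoder wrapper; meshopt_encodeVertexBufferBound sizes the output buffer, and level 2 is an arbitrary mid-range choice (the valid level range is documented in the full header, which this diff does not show):

    std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));

    size_t vbuf_size = meshopt_encodeVertexBufferLevel(vbuf.data(), vbuf.size(),
        vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 2);
    vbuf.resize(vbuf_size);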
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
@@ -1179,13 +1276,30 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
 	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
 }
 
+template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+	return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
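
A hedged sketch of in-place simplification with the new meshopt_simplifyWithUpdate wrapper, passing no attributes or locks; the 50% target and 1e-2f error are illustrative:

    size_t new_index_count = meshopt_simplifyWithUpdate(indices.data(), indices.size(),
        &vertices[0].x, vertices.size(), sizeof(Vertex),
        /* vertex_attributes= */ NULL, 0, /* attribute_weights= */ NULL, 0, /* vertex_lock= */ NULL,
        /* target_index_count= */ indices.size() / 2, /* target_error= */ 1e-2f);
    indices.resize(new_index_count);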
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
-	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
 }
 
 template <typename T>
@@ -1263,11 +1377,11 @@ inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int*
 }
 
 template <typename T>
-inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
-	return meshopt_buildMeshletsSplit(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
+	return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
 }
 
 template <typename T>

+ 7 - 7
3rdparty/meshoptimizer/src/overdrawoptimizer.cpp

@@ -10,24 +10,24 @@
 namespace meshopt
 {
 
-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
 	float mesh_centroid[3] = {};
 
-	for (size_t i = 0; i < index_count; ++i)
+	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		const float* p = vertex_positions + vertex_stride_float * indices[i];
+		const float* p = vertex_positions + vertex_stride_float * i;
 
 		mesh_centroid[0] += p[0];
 		mesh_centroid[1] += p[1];
 		mesh_centroid[2] += p[2];
 	}
 
-	mesh_centroid[0] /= index_count;
-	mesh_centroid[1] /= index_count;
-	mesh_centroid[2] /= index_count;
+	mesh_centroid[0] /= float(vertex_count);
+	mesh_centroid[1] /= float(vertex_count);
+	mesh_centroid[2] /= float(vertex_count);
 
 	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
 	{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 
 	// fill sort data
 	float* sort_data = allocator.allocate<float>(cluster_count);
-	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
 
 	// sort clusters using sort data
 	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);

+ 2 - 0
3rdparty/meshoptimizer/src/partition.cpp

@@ -5,6 +5,8 @@
 #include <math.h>
 #include <string.h>
 
+// This work is based on:
+// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
 namespace meshopt
 {
 

These changes were not shown because the diff is too large
+ 560 - 81
3rdparty/meshoptimizer/src/simplifier.cpp


+ 168 - 32
3rdparty/meshoptimizer/src/spatialorder.cpp

@@ -22,7 +22,7 @@ inline unsigned long long part1By2(unsigned long long x)
 	return x;
 }
 
-static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
@@ -60,61 +60,158 @@ static void computeOrder(unsigned long long* result, const float* vertex_positio
 		int y = int((v[1] - minv[1]) * scale + 0.5f);
 		int z = int((v[2] - minv[2]) * scale + 0.5f);
 
-		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		if (morton)
+			result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		else
+			result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
 	}
 }
 
-static void computeHistogram(unsigned int (&hist)[1024][5], const unsigned long long* data, size_t count)
+static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
 {
+	unsigned int hist[1024];
 	memset(hist, 0, sizeof(hist));
 
-	// compute 5 10-bit histograms in parallel
+	// compute histogram (assume keys are 10-bit)
+	for (size_t i = 0; i < count; ++i)
+		hist[keys[i]]++;
+
+	unsigned int sum = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int h = hist[i];
+		hist[i] = sum;
+		sum += h;
+	}
+
+	assert(sum == count);
+
+	// reorder values
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = keys[source[i]];
+
+		destination[hist[id]++] = source[i];
+	}
+}
+
+static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
+{
+	memset(hist, 0, sizeof(hist));
+
+	// compute 2 8-bit histograms in parallel
 	for (size_t i = 0; i < count; ++i)
 	{
 		unsigned long long id = data[i];
 
-		hist[(id >> 0) & 1023][0]++;
-		hist[(id >> 10) & 1023][1]++;
-		hist[(id >> 20) & 1023][2]++;
-		hist[(id >> 30) & 1023][3]++;
-		hist[(id >> 40) & 1023][4]++;
+		hist[(id >> 0) & 255][0]++;
+		hist[(id >> 8) & 255][1]++;
 	}
 
-	unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;
+	unsigned int sum0 = 0, sum1 = 0;
 
 	// replace histogram data with prefix histogram sums in-place
-	for (int i = 0; i < 1024; ++i)
+	for (int i = 0; i < 256; ++i)
 	{
-		unsigned int h0 = hist[i][0], h1 = hist[i][1], h2 = hist[i][2], h3 = hist[i][3], h4 = hist[i][4];
+		unsigned int h0 = hist[i][0], h1 = hist[i][1];
 
 		hist[i][0] = sum0;
 		hist[i][1] = sum1;
-		hist[i][2] = sum2;
-		hist[i][3] = sum3;
-		hist[i][4] = sum4;
 
 		sum0 += h0;
 		sum1 += h1;
-		sum2 += h2;
-		sum3 += h3;
-		sum4 += h4;
 	}
 
-	assert(sum0 == count && sum1 == count && sum2 == count && sum3 == count && sum4 == count);
+	assert(sum0 == count && sum1 == count);
 }
 
-static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned long long* keys, size_t count, unsigned int (&hist)[1024][5], int pass)
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
 {
-	int bitoff = pass * 10;
+	int bitoff = pass * 8;
 
 	for (size_t i = 0; i < count; ++i)
 	{
-		unsigned int id = unsigned(keys[source[i]] >> bitoff) & 1023;
+		unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255;
 
 		destination[hist[id][pass]++] = source[i];
 	}
 }
 
+static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+	size_t l = 0, r = split;
+
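+	// stable two-way partition: cursor l advances when side == 0, cursor r when side == 1 ("l += 1; l -= side" is a branchless l += !side)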
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned char side = sides[order[i]];
+		target[side ? r : l] = order[i];
+		l += 1;
+		l -= side;
+		r += side;
+	}
+
+	assert(l == split && r == count);
+}
+
+static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
+{
+	if (count <= cluster_size)
+	{
+		memcpy(destination, orderx, count * sizeof(unsigned int));
+		return;
+	}
+
+	unsigned int* axes[3] = {orderx, ordery, orderz};
+
+	int bestk = -1;
+	unsigned int bestdim = 0;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		const unsigned int mask = (1 << 20) - 1;
+		unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask);
+
+		if (dim >= bestdim)
+		{
+			bestk = k;
+			bestdim = dim;
+		}
+	}
+
+	assert(bestk >= 0);
+
+	// split roughly in half, with the left split always being aligned to cluster size
+	size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
+	assert(split > 0 && split < count);
+
+	// mark sides of split for partitioning
+	unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+	for (size_t i = 0; i < split; ++i)
+		sides[axes[bestk][i]] = 0;
+
+	for (size_t i = split; i < count; ++i)
+		sides[axes[bestk][i]] = 1;
+
+	// partition all axes into two sides, maintaining order
+	unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		if (k == bestk)
+			continue;
+
+		unsigned int* axis = axes[k];
+		memcpy(temp, axis, sizeof(unsigned int) * count);
+		partitionPoints(axis, temp, sides, split, count);
+	}
+
+	splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
+	splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
+}
+
 } // namespace meshopt
 
 void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -127,22 +224,25 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 	meshopt_Allocator allocator;
 
 	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
-	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
 
-	unsigned int hist[1024][5];
-	computeHistogram(hist, keys, vertex_count);
-
-	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
+	unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
 
 	for (size_t i = 0; i < vertex_count; ++i)
 		destination[i] = unsigned(i);
 
+	unsigned int* order[] = {scratch, destination};
+
 	// 5-pass radix sort computes the resulting order into scratch
-	radixPass(scratch, destination, keys, vertex_count, hist, 0);
-	radixPass(destination, scratch, keys, vertex_count, hist, 1);
-	radixPass(scratch, destination, keys, vertex_count, hist, 2);
-	radixPass(destination, scratch, keys, vertex_count, hist, 3);
-	radixPass(scratch, destination, keys, vertex_count, hist, 4);
+	for (int k = 0; k < 5; ++k)
+	{
+		// copy 10-bit key segments into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
+
+		radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
+	}
 
 	// since our remap table is mapping old=>new, we need to reverse it
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -202,3 +302,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 		destination[r * 3 + 2] = c;
 	}
 }
+
+void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(cluster_size > 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false);
+
+	unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3);
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
+	unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		// copy 16-bit key segments into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)(keys[i] >> (k * 20));
+
+		unsigned int hist[256][2];
+		computeHistogram(hist, keyk, vertex_count);
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			order[k * vertex_count + i] = unsigned(i);
+
+		radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0);
+		radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1);
+	}
+
+	splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
+}
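
A hedged usage sketch for the new point clusterizer, ordering a point cloud into chunks of 128 points with good locality (the Point type and container names are illustrative):

    std::vector<unsigned int> order(points.size());
    meshopt_spatialClusterPoints(order.data(), &points[0].x, points.size(), sizeof(Point), /* cluster_size= */ 128);

    // every consecutive run of 128 entries in order is now one spatial cluster; only the last run may be shorter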

+ 1 - 1
3rdparty/meshoptimizer/src/vertexcodec.cpp

@@ -122,7 +122,7 @@ namespace meshopt
 
 const unsigned char kVertexHeader = 0xa0;
 
-static int gEncodeVertexVersion = 0;
+static int gEncodeVertexVersion = 1;
 const int kDecodeVertexVersion = 1;
 
 const size_t kVertexBlockSizeBytes = 8192;
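
Since encoding now defaults to codec version 1, buffers intended for decoders that predate kDecodeVertexVersion = 1 need to pin version 0 explicitly; a hedged sketch using the trailing version argument that the C++ wrapper shown earlier forwards as -1 (default):

    size_t size = meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(),
        vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 1, /* version= */ 0);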

+ 423 - 3
3rdparty/meshoptimizer/src/vertexfilter.cpp

@@ -165,6 +165,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 		data[i] = u.ui;
 	}
 }
+
+template <typename ST, typename T>
+static void decodeFilterColor(T* data, size_t count)
+{
+	const float max = float((1 << (sizeof(T) * 8)) - 1);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from alpha high bit
+		int as = data[i * 4 + 3];
+		as |= as >> 1;
+		as |= as >> 2;
+		as |= as >> 4;
+		as |= as >> 8; // noop for 8-bit
+
+		// convert to RGB in fixed point (co/cg are sign extended)
+		int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
+
+		int r = y + co - cg;
+		int g = y + cg;
+		int b = y - co - cg;
+
+		// expand alpha by one bit to match other components
+		int a = data[i * 4 + 3];
+		a = ((a << 1) & as) | (a & 1);
+
+		// compute scaling factor
+		float ss = max / float(as);
+
+		// rounded float->int
+		int rf = int(float(r) * ss + 0.5f);
+		int gf = int(float(g) * ss + 0.5f);
+		int bf = int(float(b) * ss + 0.5f);
+		int af = int(float(a) * ss + 0.5f);
+
+		data[i * 4 + 0] = T(rf);
+		data[i * 4 + 1] = T(gf);
+		data[i * 4 + 2] = T(bf);
+		data[i * 4 + 3] = T(af);
+	}
+}
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -386,6 +427,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
+		__m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
+		__m128i af = _mm_srli_epi32(c4, 24);
+
+		// recover scale from alpha high bit
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor
+		__m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// repack rgba into final value
+		__m128i res = rr;
+		res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
+		res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
+		__m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		__m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
+		__m128i cof = _mm_srai_epi32(c4_yco, 16);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
+		__m128i af = _mm_srli_epi32(c4_cga, 16);
+
+		// recover scale from alpha high bit
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor
+		__m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		__m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
+		__m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		__m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
+		__m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif
 
 #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
@@ -596,6 +736,111 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
+		int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
+
+		// recover scale from alpha high bit
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor
+		float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
+
+		// convert to RGB in fixed point
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
+		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
+		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
+		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+
+		// repack rgba into final value
+		int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
+		res = vorrq_s32(res, vshlq_n_s32(ar, 24));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0];
+		int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1];
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff));
+		int32x4_t cof = vshrq_n_s32(c4_yco, 16);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16));
+
+		// recover scale from alpha high bit
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+		as = vorrq_s32(as, vshrq_n_s32(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as));
+
+		// convert to RGB in fixed point
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
+		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
+		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
+		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
+		int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
@@ -651,7 +896,8 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
 static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);
-	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zmask = wasm_i32x4_splat(0x7fff);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -763,8 +1009,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
 
 		// compute component index shifted left by 4 (and moved into i32x4 slot)
-		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
-		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+		v128_t cm = wasm_i32x4_shl(cf, 4);
 
 		// rotate and store
 		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
@@ -795,6 +1040,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		wasm_v128_store(&data[i], r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t c4 = wasm_v128_load(&data[i * 4]);
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff));
+		v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24));
+
+		// recover scale from alpha high bit
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// repack rgba into final value
+		v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16));
+		res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1);
+		v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1);
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff));
+		v128_t cof = wasm_i32x4_shr(c4_yco, 16);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16));
+
+		// recover scale from alpha high bit
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
+		v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
+		v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
 #endif
 
 // optimized variant of frexp
@@ -872,6 +1228,25 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 #endif
 }
 
+void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (stride == 4)
+		dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4);
+	else
+		dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4);
+#else
+	if (stride == 4)
+		decodeFilterColor<signed char>(static_cast<unsigned char*>(buffer), count);
+	else
+		decodeFilterColor<short>(static_cast<unsigned short*>(buffer), count);
+#endif
+}
+
 void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
 {
 	assert(stride == 4 || stride == 8);
@@ -1042,6 +1417,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 	}
 }
 
+void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 2 && bits <= 16);
+
+	unsigned char* d8 = static_cast<unsigned char*>(destination);
+	unsigned short* d16 = static_cast<unsigned short*>(destination);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* c = &data[i * 4];
+
+		int fr = meshopt_quantizeUnorm(c[0], bits);
+		int fg = meshopt_quantizeUnorm(c[1], bits);
+		int fb = meshopt_quantizeUnorm(c[2], bits);
+
+		// YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers
+		int fco = (fr - fb) / 2;
+		int tmp = fb + fco;
+		int fcg = (fg - tmp) / 2;
+		int fy = tmp + fcg;
+
+		// validate that R/G/B can be reconstructed with K bit integers
+		assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
+
+		// alpha: K-1-bit encoding with high bit set to 1
+		int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
+
+		if (stride == 4)
+		{
+			d8[i * 4 + 0] = (unsigned char)(fy);
+			d8[i * 4 + 1] = (unsigned char)(fco);
+			d8[i * 4 + 2] = (unsigned char)(fcg);
+			d8[i * 4 + 3] = (unsigned char)(fa);
+		}
+		else
+		{
+			d16[i * 4 + 0] = (unsigned short)(fy);
+			d16[i * 4 + 1] = (unsigned short)(fco);
+			d16[i * 4 + 2] = (unsigned short)(fcg);
+			d16[i * 4 + 3] = (unsigned short)(fa);
+		}
+	}
+}
+
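
A worked round trip of the truncated YCoCg-R transform above, with bits = 8 and opaque alpha (so the decode scale is 1); the input values are illustrative:

    int fr = 200, fg = 120, fb = 50;

    int fco = (fr - fb) / 2;  // 75
    int tmp = fb + fco;       // 125
    int fcg = (fg - tmp) / 2; // -2; C division truncates toward zero
    int fy = tmp + fcg;       // 123

    // decode per decodeFilterColor: r and b are exact, g is off by one due to the truncated Cg
    int r = fy + fco - fcg;   // 200
    int g = fy + fcg;         // 121
    int b = fy - fco - fcg;   // 50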
 #undef SIMD_SSE
 #undef SIMD_NEON
 #undef SIMD_WASM

Some files were not shown because too many files changed in this diff