Update meshoptimizer

Panagiotis Christopoulos Charitos · 2 years ago
commit 4138e34f6a

+ 2 - 1
ThirdParty/MeshOptimizer/allocator.cpp

@@ -1,6 +1,7 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
-void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*))
 {
 	meshopt_Allocator::Storage::allocate = allocate;
 	meshopt_Allocator::Storage::deallocate = deallocate;
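
The only functional change above is the MESHOPTIMIZER_ALLOC_CALLCONV qualifier on the callback types (the macro is introduced in meshoptimizer.h below: __cdecl under MSVC, empty otherwise). A minimal usage sketch, assuming the CRT allocator as the backing store; trackedAlloc/trackedFree are illustrative names, not part of the library:

#include <cstdlib>
#include "meshoptimizer.h"

// Hypothetical hooks that forward to the CRT; any pair matching the
// MESHOPTIMIZER_ALLOC_CALLCONV signature can be installed the same way.
static void* MESHOPTIMIZER_ALLOC_CALLCONV trackedAlloc(size_t size)
{
	return std::malloc(size);
}

static void MESHOPTIMIZER_ALLOC_CALLCONV trackedFree(void* ptr)
{
	std::free(ptr);
}

void installMeshoptAllocator()
{
	// All temporary allocations made by meshoptimizer go through these hooks.
	meshopt_setAllocator(trackedAlloc, trackedFree);
}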

+ 591 - 58
ThirdParty/MeshOptimizer/clusterizer.cpp

@@ -2,6 +2,7 @@
 #include "meshoptimizer.h"
 
 #include <assert.h>
+#include <float.h>
 #include <math.h>
 #include <string.h>
 
@@ -12,6 +13,68 @@
 namespace meshopt
 {
 
+// This must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet
+const size_t kMeshletMaxVertices = 255;
+
+// A reasonable limit is around 2*max_vertices or less
+const size_t kMeshletMaxTriangles = 512;
+
+struct TriangleAdjacency2
+{
+	unsigned int* counts;
+	unsigned int* offsets;
+	unsigned int* data;
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
 static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
 {
 	assert(count > 0);
@@ -82,13 +145,382 @@ static void computeBoundingSphere(float result[4], const float points[][3], size
 	result[3] = radius;
 }
 
+struct Cone
+{
+	float px, py, pz;
+	float nx, ny, nz;
+};
+
+static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+{
+	float cone = 1.f - spread * cone_weight;
+	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
+
+	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+}
+
+static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
+{
+	Cone result = acc;
+
+	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);
+
+	result.px *= center_scale;
+	result.py *= center_scale;
+	result.pz *= center_scale;
+
+	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
+	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);
+
+	result.nx *= axis_scale;
+	result.ny *= axis_scale;
+	result.nz *= axis_scale;
+
+	return result;
+}
+
+static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	(void)vertex_count;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+	size_t face_count = index_count / 3;
+
+	float mesh_area = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+		float invarea = (area == 0.f) ? 0.f : 1.f / area;
+
+		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
+		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
+		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
+
+		triangles[i].nx = normalx * invarea;
+		triangles[i].ny = normaly * invarea;
+		triangles[i].nz = normalz * invarea;
+
+		mesh_area += area;
+	}
+
+	return mesh_area;
+}
+
+static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+{
+	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
+
+	// fill 4b padding with 0
+	while (offset & 3)
+		meshlet_triangles[offset++] = 0;
+}
+
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+{
+	unsigned char& av = used[a];
+	unsigned char& bv = used[b];
+	unsigned char& cv = used[c];
+
+	bool result = false;
+
+	unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	{
+		meshlets[meshlet_offset] = meshlet;
+
+		for (size_t j = 0; j < meshlet.vertex_count; ++j)
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
+
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlet.vertex_offset += meshlet.vertex_count;
+		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.vertex_count = 0;
+		meshlet.triangle_count = 0;
+
+		result = true;
+	}
+
+	if (av == 0xff)
+	{
+		av = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
+	}
+
+	if (bv == 0xff)
+	{
+		bv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
+	}
+
+	if (cv == 0xff)
+	{
+		cv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
+	}
+
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet.triangle_count++;
+
+	return result;
+}
+
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra)
+{
+	unsigned int best_triangle = ~0u;
+	unsigned int best_extra = 5;
+	float best_score = FLT_MAX;
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+
+			// triangles that don't add new vertices to meshlets are max. priority
+			if (extra != 0)
+			{
+				// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+				if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+					extra = 0;
+
+				extra++;
+			}
+
+			// since topology-based priority is always more important than the score, we can skip scoring in some cases
+			if (extra > best_extra)
+				continue;
+
+			float score = 0;
+
+			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
+			if (meshlet_cone)
+			{
+				const Cone& tri_cone = triangles[triangle];
+
+				float distance2 =
+				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
+				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
+				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
+
+				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
+
+				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
+			}
+			else
+			{
+				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
+				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
+			}
+
+			// note that topology-based priority is always more important than the score
+			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
+			if (extra < best_extra || score < best_score)
+			{
+				best_triangle = triangle;
+				best_extra = extra;
+				best_score = score;
+			}
+		}
+	}
+
+	if (out_extra)
+		*out_extra = best_extra;
+
+	return best_triangle;
+}
+
+struct KDNode
+{
+	union
+	{
+		float split;
+		unsigned int index;
+	};
+
+	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
+	unsigned int axis : 2;
+	unsigned int children : 30;
+};
+
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+{
+	size_t m = 0;
+
+	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+	for (size_t i = 0; i < count; ++i)
+	{
+		float v = points[indices[i] * stride + axis];
+
+		// swap(m, i) unconditionally
+		unsigned int t = indices[m];
+		indices[m] = indices[i];
+		indices[i] = t;
+
+		// when v >= pivot, we swap i with m without advancing it, preserving invariants
+		m += v < pivot;
+	}
+
+	return m;
+}
+
+static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
+{
+	assert(offset + count <= node_count);
+	(void)node_count;
+
+	KDNode& result = nodes[offset];
+
+	result.index = indices[0];
+	result.axis = 3;
+	result.children = unsigned(count - 1);
+
+	// all remaining points are stored in nodes immediately following the leaf
+	for (size_t i = 1; i < count; ++i)
+	{
+		KDNode& tail = nodes[offset + i];
+
+		tail.index = indices[i];
+		tail.axis = 3;
+		tail.children = ~0u >> 2; // bogus value to prevent misuse
+	}
+
+	return offset + count;
+}
+
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+{
+	assert(count > 0);
+	assert(offset < node_count);
+
+	if (count <= leaf_size)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	float mean[3] = {};
+	float vars[3] = {};
+	float runc = 1, runs = 1;
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = points + indices[i] * stride;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]);
+		}
+	}
+
+	// split axis is one where the variance is largest
+	unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1 : 2;
+
+	float split = mean[axis];
+	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
+
+	// when the partition is degenerate simply consolidate the points into a single node
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	KDNode& result = nodes[offset];
+
+	result.split = split;
+	result.axis = axis;
+
+	// left subtree is right after our node
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+
+	// distance to the right subtree is represented explicitly
+	result.children = unsigned(next_offset - offset - 1);
+
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+}
+
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+{
+	const KDNode& node = nodes[root];
+
+	if (node.axis == 3)
+	{
+		// leaf
+		for (unsigned int i = 0; i <= node.children; ++i)
+		{
+			unsigned int index = nodes[root + i].index;
+
+			if (emitted_flags[index])
+				continue;
+
+			const float* point = points + index * stride;
+
+			float distance2 =
+			    (point[0] - position[0]) * (point[0] - position[0]) +
+			    (point[1] - position[1]) * (point[1] - position[1]) +
+			    (point[2] - position[2]) * (point[2] - position[2]);
+			float distance = sqrtf(distance2);
+
+			if (distance < limit)
+			{
+				result = index;
+				limit = distance;
+			}
+		}
+	}
+	else
+	{
+		// branch; we order recursion to process the node that search position is in first
+		float delta = position[node.axis] - node.split;
+		unsigned int first = (delta <= 0) ? 0 : node.children;
+		unsigned int second = first ^ node.children;
+
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+
+		// only process the other node if it can have a match based on closest distance so far
+		if (fabsf(delta) <= limit)
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
 {
+	using namespace meshopt;
+
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	(void)kMeshletMaxVertices;
+	(void)kMeshletMaxTriangles;
 
 	// meshlet construction is limited by max vertices and max triangles per meshlet
 	// the worst case is that the input is an unindexed stream since this equally stresses both limits
@@ -100,77 +532,181 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }
 
-size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
+	using namespace meshopt;
+
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	assert(cone_weight >= 0 && cone_weight <= 1);
 
 	meshopt_Allocator allocator;
 
-	meshopt_Meshlet meshlet;
-	memset(&meshlet, 0, sizeof(meshlet));
+	TriangleAdjacency2 adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	size_t face_count = index_count / 3;
+
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
 
-	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
-	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+	// for each triangle, precompute centroid & normal to use for scoring
+	Cone* triangles = allocator.allocate<Cone>(face_count);
+	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
+	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
+	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
+
+	// build a kd-tree for nearest neighbor lookup
+	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
+	for (size_t i = 0; i < face_count; ++i)
+		kdindices[i] = unsigned(i);
+
+	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
 
 	// index of the vertex in the meshlet, 0xff if the vertex isn't used
 	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
 	memset(used, -1, vertex_count);
 
-	size_t offset = 0;
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
 
-	for (size_t i = 0; i < index_count; i += 3)
+	Cone meshlet_cone_acc = {};
+
+	for (;;)
 	{
-		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
 
-		unsigned char& av = used[a];
-		unsigned char& bv = used[b];
-		unsigned char& cv = used[c];
+		unsigned int best_extra = 0;
+		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
 
-		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
+		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+		{
+			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
+		}
 
-		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+		if (best_triangle == ~0u)
 		{
-			destination[offset++] = meshlet;
+			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
+			unsigned int index = ~0u;
+			float limit = FLT_MAX;
 
-			for (size_t j = 0; j < meshlet.vertex_count; ++j)
-				used[meshlet.vertices[j]] = 0xff;
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
 
-			memset(&meshlet, 0, sizeof(meshlet));
+			best_triangle = index;
 		}
 
-		if (av == 0xff)
-		{
-			av = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = a;
-		}
+		if (best_triangle == ~0u)
+			break;
 
-		if (bv == 0xff)
+		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
 		{
-			bv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = b;
+			meshlet_offset++;
+			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}
 
-		if (cv == 0xff)
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
 		{
-			cv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = c;
+			unsigned int index = indices[best_triangle * 3 + k];
+
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbors_size; ++i)
+			{
+				unsigned int tri = neighbors[i];
+
+				if (tri == best_triangle)
+				{
+					neighbors[i] = neighbors[neighbors_size - 1];
+					adjacency.counts[index]--;
+					break;
+				}
+			}
 		}
 
-		meshlet.indices[meshlet.triangle_count][0] = av;
-		meshlet.indices[meshlet.triangle_count][1] = bv;
-		meshlet.indices[meshlet.triangle_count][2] = cv;
-		meshlet.triangle_count++;
+		// update aggregated meshlet cone data for scoring subsequent triangles
+		meshlet_cone_acc.px += triangles[best_triangle].px;
+		meshlet_cone_acc.py += triangles[best_triangle].py;
+		meshlet_cone_acc.pz += triangles[best_triangle].pz;
+		meshlet_cone_acc.nx += triangles[best_triangle].nx;
+		meshlet_cone_acc.ny += triangles[best_triangle].ny;
+		meshlet_cone_acc.nz += triangles[best_triangle].nz;
+
+		emitted_flags[best_triangle] = 1;
+	}
+
+	if (meshlet.triangle_count)
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	meshopt_Allocator allocator;
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
 	}
 
 	if (meshlet.triangle_count)
-		destination[offset++] = meshlet;
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
 
-	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+		meshlets[meshlet_offset++] = meshlet;
+	}
 
-	return offset;
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
 }
 
 meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -178,18 +714,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(index_count / 3 <= kMeshletMaxTriangles);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	assert(index_count / 3 <= 256);
-
 	(void)vertex_count;
 
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
 	// compute triangle normals and gather triangle corners
-	float normals[256][3];
-	float corners[256][3][3];
+	float normals[kMeshletMaxTriangles][3];
+	float corners[kMeshletMaxTriangles][3][3];
 	size_t triangles = 0;
 
 	for (size_t i = 0; i < index_count; i += 3)
@@ -327,25 +862,23 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	return bounds;
 }
 
-meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	using namespace meshopt;
+
+	assert(triangle_count <= kMeshletMaxTriangles);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];
+	unsigned int indices[kMeshletMaxTriangles * 3];
 
-	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
-		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
-		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
-
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+		unsigned int index = meshlet_vertices[meshlet_triangles[i]];
+		assert(index < vertex_count);
 
-		indices[i * 3 + 0] = a;
-		indices[i * 3 + 1] = b;
-		indices[i * 3 + 2] = c;
+		indices[i] = index;
 	}
 
-	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
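
The rewrite replaces the old fixed-size meshopt_Meshlet layout with externally allocated meshlet_vertices/meshlet_triangles buffers and adds a cone_weight parameter for spatial scoring. A sketch of the new call sequence, assuming the conventional buffer sizing of max_vertices indices and max_triangles*3 triangle bytes per potential meshlet; the limits and cone_weight below are placeholder values, not requirements:

#include <vector>
#include "meshoptimizer.h"

std::vector<meshopt_Meshlet> buildMeshlets(const std::vector<unsigned int>& indices,
                                           const std::vector<float>& positions, // tightly packed xyz
                                           size_t vertex_count)
{
	const size_t max_vertices = 64;
	const size_t max_triangles = 124; // multiple of 4, as the new assert requires
	const float cone_weight = 0.0f;   // 0 disables the cone term; scoring is distance-only

	size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);

	std::vector<meshopt_Meshlet> meshlets(max_meshlets);
	std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
	std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

	size_t count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
	                                     indices.data(), indices.size(),
	                                     positions.data(), vertex_count, sizeof(float) * 3,
	                                     max_vertices, max_triangles, cone_weight);

	meshlets.resize(count);
	return meshlets;
}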

+ 194 - 99
ThirdParty/MeshOptimizer/indexcodec.cpp

@@ -4,14 +4,6 @@
 #include <assert.h>
 #include <string.h>
 
-#ifndef TRACE
-#define TRACE 0
-#endif
-
-#if TRACE
-#include <stdio.h>
-#endif
-
 // This work is based on:
 // Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
 // Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
@@ -19,6 +11,9 @@ namespace meshopt
 {
 
 const unsigned char kIndexHeader = 0xe0;
+const unsigned char kSequenceHeader = 0xd0;
+
+static int gEncodeIndexVersion = 1;
 
 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -113,7 +108,7 @@ static unsigned int decodeVByte(const unsigned char*& data)
 	for (int i = 0; i < 4; ++i)
 	{
 		unsigned char group = *data++;
-		result |= (group & 127) << shift;
+		result |= unsigned(group & 127) << shift;
 		shift += 7;
 
 		if (group < 128)
@@ -123,20 +118,16 @@ static unsigned int decodeVByte(const unsigned char*& data)
 	return result;
 }
 
-static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int next, unsigned int last)
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
 {
-	(void)next;
-
 	unsigned int d = index - last;
 	unsigned int v = (d << 1) ^ (int(d) >> 31);
 
 	encodeVByte(data, v);
 }
 
-static unsigned int decodeIndex(const unsigned char*& data, unsigned int next, unsigned int last)
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
 {
-	(void)next;
-
 	unsigned int v = decodeVByte(data);
 	unsigned int d = (v >> 1) ^ -int(v & 1);
 
@@ -168,38 +159,6 @@ static void writeTriangle(void* destination, size_t offset, size_t index_size, u
 	}
 }
 
-#if TRACE
-static size_t sortTop16(unsigned char dest[16], size_t stats[256])
-{
-	size_t destsize = 0;
-
-	for (size_t i = 0; i < 256; ++i)
-	{
-		size_t j = 0;
-		for (; j < destsize; ++j)
-		{
-			if (stats[i] >= stats[dest[j]])
-			{
-				if (destsize < 16)
-					destsize++;
-
-				memmove(&dest[j + 1], &dest[j], destsize - 1 - j);
-				dest[j] = (unsigned char)i;
-				break;
-			}
-		}
-
-		if (j == destsize && destsize < 16)
-		{
-			dest[destsize] = (unsigned char)i;
-			destsize++;
-		}
-	}
-
-	return destsize;
-}
-#endif
-
 } // namespace meshopt
 
 size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
@@ -208,16 +167,13 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 
 	assert(index_count % 3 == 0);
 
-#if TRACE
-	size_t codestats[256] = {};
-	size_t codeauxstats[256] = {};
-#endif
-
 	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
 	if (buffer_size < 1 + index_count / 3 + 16)
 		return 0;
 
-	buffer[0] = kIndexHeader;
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kIndexHeader | version);
 
 	EdgeFifo edgefifo;
 	memset(edgefifo, -1, sizeof(edgefifo));
@@ -235,6 +191,8 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 	unsigned char* data = code + index_count / 3;
 	unsigned char* data_safe_end = buffer + buffer_size - 16;
 
+	int fecmax = version >= 1 ? 13 : 15;
+
 	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
 	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
 	const unsigned char* codeaux_table = kCodeAuxEncodingTable;
@@ -259,20 +217,25 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 			int fe = fer >> 2;
 			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
 
-			int fec = (fc >= 1 && fc < 15) ? fc : (c == next) ? (next++, 0) : 15;
+			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15;
 
-			*code++ = (unsigned char)((fe << 4) | fec);
+			if (fec == 15 && version >= 1)
+			{
+				// encode last-1 and last+1 to optimize strip-like sequences
+				if (c + 1 == last)
+					fec = 13, last = c;
+				if (c == last + 1)
+					fec = 14, last = c;
+			}
 
-#if TRACE
-			codestats[code[-1]]++;
-#endif
+			*code++ = (unsigned char)((fe << 4) | fec);
 
 			// note that we need to update the last index since free indices are delta-encoded
 			if (fec == 15)
-				encodeIndex(data, c, next, last), last = c;
+				encodeIndex(data, c, last), last = c;
 
 			// we only need to push third vertex since first two are likely already in the vertex fifo
-			if (fec == 0 || fec == 15)
+			if (fec == 0 || fec >= fecmax)
 				pushVertexFifo(vertexfifo, c, vertexfifooffset);
 
 			// we only need to push two new edges to edge fifo since the third one is already there
@@ -286,6 +249,19 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 
 			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
 
+			// if a/b/c are 0/1/2, we emit a reset code
+			bool reset = false;
+
+			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+			{
+				reset = true;
+				next = 0;
+
+				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+				// this makes sure next continues to get incremented instead of being stuck
+				memset(vertexfifo, -1, sizeof(vertexfifo));
+			}
+
 			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
 			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
 
@@ -299,7 +275,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
 
 			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
-			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14)
+			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
 			{
 				*code++ = (unsigned char)((15 << 4) | codeauxindex);
 			}
@@ -309,20 +285,15 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 				*data++ = codeaux;
 			}
 
-#if TRACE
-			codestats[code[-1]]++;
-			codeauxstats[codeaux]++;
-#endif
-
 			// note that we need to update the last index since free indices are delta-encoded
 			if (fea == 15)
-				encodeIndex(data, a, next, last), last = a;
+				encodeIndex(data, a, last), last = a;
 
 			if (feb == 15)
-				encodeIndex(data, b, next, last), last = b;
+				encodeIndex(data, b, last), last = b;
 
 			if (fec == 15)
-				encodeIndex(data, c, next, last), last = c;
+				encodeIndex(data, c, last), last = c;
 
 			// only push vertices that weren't already in fifo
 			if (fea == 0 || fea == 15)
@@ -356,33 +327,12 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 		*data++ = codeaux_table[i];
 	}
 
+	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+	assert(codeaux_table[0] == 0);
+
 	assert(data >= buffer + index_count / 3 + 16);
 	assert(data <= buffer + buffer_size);
 
-#if TRACE
-	unsigned char codetop[16], codeauxtop[16];
-	size_t codetopsize = sortTop16(codetop, codestats);
-	size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);
-
-	size_t sumcode = 0, sumcodeaux = 0;
-	for (size_t i = 0; i < 256; ++i)
-		sumcode += codestats[i], sumcodeaux += codeauxstats[i];
-
-	size_t acccode = 0, acccodeaux = 0;
-
-	printf("code\t\t\t\t\tcodeaux\n");
-
-	for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
-	{
-		acccode += codestats[codetop[i]];
-		acccodeaux += codeauxstats[codeauxtop[i]];
-
-		printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
-		       int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
-		       int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
-	}
-#endif
-
 	return data - buffer;
 }
 
@@ -402,6 +352,13 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
 	return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16;
 }
 
+void meshopt_encodeIndexVersion(int version)
+{
+	assert(unsigned(version) <= 1);
+
+	meshopt::gEncodeIndexVersion = version;
+}
+
 int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -413,7 +370,11 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 	if (buffer_size < 1 + index_count / 3 + 16)
 		return -2;
 
-	if (buffer[0] != kIndexHeader)
+	if ((buffer[0] & 0xf0) != kIndexHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
 		return -1;
 
 	EdgeFifo edgefifo;
@@ -428,6 +389,8 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 	unsigned int next = 0;
 	unsigned int last = 0;
 
+	int fecmax = version >= 1 ? 13 : 15;
+
 	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
 	const unsigned char* code = buffer + 1;
 	const unsigned char* data = code + index_count / 3;
@@ -457,7 +420,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 
 			// note: this is the most common path in the entire decoder
 			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
-			if (fec != 15)
+			if (fec < fecmax)
 			{
 				// fifo reads are wrapped around 16 entry buffer
 				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
@@ -479,8 +442,9 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 			{
 				unsigned int c = 0;
 
+				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
 				// note that we need to update the last index since free indices are delta-encoded
-				last = c = decodeIndex(data, next, last);
+				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
 
 				// output triangle
 				writeTriangle(destination, i, index_size, a, b, c);
@@ -540,6 +504,10 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 				int feb = codeaux >> 4;
 				int fec = codeaux & 15;
 
+				// reset: codeaux is 0 but encoded as not-a-table
+				if (codeaux == 0)
+					next = 0;
+
 				// fifo reads are wrapped around 16 entry buffer
 				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
 				unsigned int a = (fea == 0) ? next++ : 0;
@@ -548,13 +516,13 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 
 				// note that we need to update the last index since free indices are delta-encoded
 				if (fea == 15)
-					last = a = decodeIndex(data, next, last);
+					last = a = decodeIndex(data, last);
 
 				if (feb == 15)
-					last = b = decodeIndex(data, next, last);
+					last = b = decodeIndex(data, last);
 
 				if (fec == 15)
-					last = c = decodeIndex(data, next, last);
+					last = c = decodeIndex(data, last);
 
 				// output triangle
 				writeTriangle(destination, i, index_size, a, b, c);
@@ -577,3 +545,130 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 
 	return 0;
 }
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kSequenceHeader | version);
+
+	unsigned int last[2] = {};
+	unsigned int current = 0;
+
+	unsigned char* data = buffer + 1;
+	unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to write
+		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can write without extra bounds checks
+		if (data >= data_safe_end)
+			return 0;
+
+		unsigned int index = indices[i];
+
+		// this is a heuristic that switches between baselines when the delta grows too large
+		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+		int cd = int(index - last[current]);
+		current ^= ((cd < 0 ? -cd : cd) >= 30);
+
+		// encode delta from the last index
+		unsigned int d = index - last[current];
+		unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+		// note: low bit encodes the index of the last baseline which will be used for reconstruction
+		encodeVByte(data, (v << 1) | current);
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+	}
+
+	// make sure we have enough space to write tail
+	if (data > data_safe_end)
+		return 0;
+
+	for (int k = 0; k < 4; ++k)
+		*data++ = 0;
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+	unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7;
+
+	return 1 + index_count * vertex_groups + 4;
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kSequenceHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	const unsigned char* data = buffer + 1;
+	const unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	unsigned int last[2] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to read
+		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can read without extra bounds checks
+		if (data >= data_safe_end)
+			return -2;
+
+		unsigned int v = decodeVByte(data);
+
+		// decode the index of the last baseline
+		unsigned int current = v & 1;
+		v >>= 1;
+
+		// reconstruct index as a delta
+		unsigned int d = (v >> 1) ^ -int(v & 1);
+		unsigned int index = last[current] + d;
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+
+		if (index_size == 2)
+		{
+			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
+		}
+		else
+		{
+			static_cast<unsigned int*>(destination)[i] = index;
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and tail
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
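
Alongside the versioned triangle codec (header byte kIndexHeader | version), this file now exposes a codec for arbitrary index sequences under kSequenceHeader. A round-trip sketch, assuming 32-bit output indices; error handling is reduced to an assert for brevity:

#include <cassert>
#include <vector>
#include "meshoptimizer.h"

std::vector<unsigned char> encodeSequence(const std::vector<unsigned int>& indices, size_t vertex_count)
{
	// Opt into the new format explicitly; version 0 streams remain decodable.
	meshopt_encodeIndexVersion(1);

	std::vector<unsigned char> buffer(meshopt_encodeIndexSequenceBound(indices.size(), vertex_count));
	buffer.resize(meshopt_encodeIndexSequence(buffer.data(), buffer.size(), indices.data(), indices.size()));
	return buffer;
}

std::vector<unsigned int> decodeSequence(const std::vector<unsigned char>& buffer, size_t index_count)
{
	std::vector<unsigned int> result(index_count);
	int rc = meshopt_decodeIndexSequence(result.data(), index_count, sizeof(unsigned int), buffer.data(), buffer.size());
	assert(rc == 0);
	(void)rc;
	return result;
}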

+ 240 - 9
ThirdParty/MeshOptimizer/indexgenerator.cpp

@@ -4,6 +4,8 @@
 #include <assert.h>
 #include <string.h>
 
+// This work is based on:
+// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
 namespace meshopt
 {
 
@@ -83,10 +85,49 @@ struct VertexStreamHasher
 	}
 };
 
+struct EdgeHasher
+{
+	const unsigned int* remap;
+
+	size_t hash(unsigned long long edge) const
+	{
+		unsigned int e0 = unsigned(edge >> 32);
+		unsigned int e1 = unsigned(edge);
+
+		unsigned int h1 = remap[e0];
+		unsigned int h2 = remap[e1];
+
+		const unsigned int m = 0x5bd1e995;
+
+		// MurmurHash64B finalizer
+		h1 ^= h2 >> 18;
+		h1 *= m;
+		h2 ^= h1 >> 22;
+		h2 *= m;
+		h1 ^= h2 >> 17;
+		h1 *= m;
+		h2 ^= h1 >> 19;
+		h2 *= m;
+
+		return h2;
+	}
+
+	bool equal(unsigned long long lhs, unsigned long long rhs) const
+	{
+		unsigned int l0 = unsigned(lhs >> 32);
+		unsigned int l1 = unsigned(lhs);
+
+		unsigned int r0 = unsigned(rhs >> 32);
+		unsigned int r1 = unsigned(rhs);
+
+		return remap[l0] == remap[r0] && remap[l1] == remap[r1];
+	}
+};
+
 static size_t hashBuckets(size_t count)
 {
 	size_t buckets = 1;
-	while (buckets < count)
+	while (buckets < count + count / 4)
 		buckets *= 2;
 
 	return buckets;
@@ -116,7 +157,43 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
 	}
 
 	assert(false && "Hash table is full"); // unreachable
-	return 0;
+	return NULL;
+}
+
+static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{
+	VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
+
+	size_t vertex_table_size = hashBuckets(vertex_count);
+	unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
+	memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index;
+
+		remap[index] = *entry;
+	}
+
+	allocator.deallocate(vertex_table);
+}
+
+template <size_t BlockSize>
+static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
+	assert(block_size == vertex_size);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
+		}
 }
 
 } // namespace meshopt
@@ -126,7 +203,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
 	using namespace meshopt;
 
 	assert(indices || index_count == vertex_count);
-	assert(index_count % 3 == 0);
+	assert(!indices || index_count % 3 == 0);
 	assert(vertex_size > 0 && vertex_size <= 256);
 
 	meshopt_Allocator allocator;
@@ -227,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
 
 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
+	using namespace meshopt;
+
 	assert(vertex_size > 0 && vertex_size <= 256);
 
 	meshopt_Allocator allocator;
@@ -239,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v
 		vertices = vertices_copy;
 	}
 
-	for (size_t i = 0; i < vertex_count; ++i)
+	// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
+	switch (vertex_size)
 	{
-		if (remap[i] != ~0u)
-		{
-			assert(remap[i] < vertex_count);
+	case 4:
+		return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);
 
-			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
-		}
+	case 8:
+		return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 12:
+		return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 16:
+		return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
+
+	default:
+		return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
 	}
 }
 
@@ -345,3 +433,146 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 		destination[i] = remap[index];
 	}
 }
+
+void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[4] = {1, 2, 0, 1};
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size);
+
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+	memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			unsigned int i2 = indices[i + next[e + 1]];
+			assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull)
+			{
+				*entry = edge;
+
+				// store vertex opposite to the edge
+				edge_vertex_table[entry - edge_table] = i2;
+			}
+		}
+	}
+
+	// build resulting index buffer: 6 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[6];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			patch[e * 2 + 0] = i0;
+			patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table];
+		}
+
+		memcpy(destination + i * 2, patch, sizeof(patch));
+	}
+}
+
+void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[3] = {1, 2, 0};
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull)
+				*entry = edge;
+		}
+	}
+
+	// build resulting index buffer: 12 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[12];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			// use the same edge if opposite edge doesn't exist (border)
+			oppe = (oppe == ~0ull) ? edge : oppe;
+
+			// triangle index (0, 1, 2)
+			patch[e] = i0;
+
+			// opposite edge (3, 4; 5, 6; 7, 8)
+			patch[3 + e * 2 + 0] = unsigned(oppe);
+			patch[3 + e * 2 + 1] = unsigned(oppe >> 32);
+
+			// dominant vertex (9, 10, 11)
+			patch[9 + e] = remap[i0];
+		}
+
+		memcpy(destination + i * 4, patch, sizeof(patch));
+	}
+}
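
The two new generators reuse the position remap and edge hash above to emit geometry-shader adjacency patches (6 indices per triangle) and PN-AEN tessellation patches (12 indices per triangle). A usage sketch, assuming tightly packed float3 positions (stride 12) and the output sizes documented in the header (index_count*2 and index_count*4 entries):

#include <vector>
#include "meshoptimizer.h"

void buildAuxIndexBuffers(const std::vector<unsigned int>& indices,
                          const std::vector<float>& positions, // tightly packed xyz
                          size_t vertex_count,
                          std::vector<unsigned int>& adjacency,
                          std::vector<unsigned int>& tessellation)
{
	// 6 indices per input triangle: original corners interleaved with adjacent-edge vertices.
	adjacency.resize(indices.size() * 2);
	meshopt_generateAdjacencyIndexBuffer(adjacency.data(), indices.data(), indices.size(),
	                                     positions.data(), vertex_count, sizeof(float) * 3);

	// 12 indices per input triangle: corners, opposing edges, and dominant vertices.
	tessellation.resize(indices.size() * 4);
	meshopt_generateTessellationIndexBuffer(tessellation.data(), indices.data(), indices.size(),
	                                        positions.data(), vertex_count, sizeof(float) * 3);
}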

+ 359 - 138
ThirdParty/MeshOptimizer/meshoptimizer.h

@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.12
+ * meshoptimizer - version 0.19
  *
- * Copyright (C) 2016-2019, by Arseny Kapoulkine ([email protected])
+ * Copyright (C) 2016-2023, by Arseny Kapoulkine ([email protected])
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,13 +12,22 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 120
+#define MESHOPTIMIZER_VERSION 190 /* 0.19 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
 #define MESHOPTIMIZER_API
 #endif
 
+/* Set the calling-convention for alloc/dealloc function pointers */
+#ifndef MESHOPTIMIZER_ALLOC_CALLCONV
+#ifdef _MSC_VER
+#define MESHOPTIMIZER_ALLOC_CALLCONV __cdecl
+#else
+#define MESHOPTIMIZER_ALLOC_CALLCONV
+#endif
+#endif
+
 /* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
 #define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
 
@@ -28,8 +37,8 @@ extern "C" {
 #endif
 
 /**
- * Vertex attribute stream, similar to glVertexPointer
- * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ * Vertex attribute stream
+ * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size).
  */
 struct meshopt_Stream
 {
@@ -42,6 +51,7 @@ struct meshopt_Stream
  * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
  * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
  * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
  *
  * destination must contain enough space for the resulting remap table (vertex_count elements)
  * indices can be NULL if the input is unindexed
@@ -53,6 +63,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
  * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
  * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
  * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
  *
  * destination must contain enough space for the resulting remap table (vertex_count elements)
  * indices can be NULL if the input is unindexed
@@ -79,6 +90,7 @@ MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const
  * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
  * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
  * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  */
@@ -88,11 +100,41 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
  * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
  * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
  * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  */
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
+/**
+ * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
+ * Each triangle is converted into a 6-vertex patch with the following layout:
+ * - 0, 2, 4: original triangle vertices
+ * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40
+ * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY.
+ * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*2 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
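    As a caller-side sketch (not part of the header, assuming <vector> and an interleaved Vertex struct whose first 12 bytes are the float3 position), the destination simply needs twice the input index count:

        std::vector<unsigned int> adjacency_indices(index_count * 2);
        meshopt_generateAdjacencyIndexBuffer(adjacency_indices.data(), indices, index_count,
            &vertices[0].px, vertex_count, sizeof(Vertex));
        /* render with 6 indices per primitive, e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY */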
+
+/**
+ * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement
+ * Each triangle is converted into a 12-vertex patch with the following layout:
+ * - 0, 1, 2: original triangle vertices
+ * - 3, 4: opposing edge for edge 0, 1
+ * - 5, 6: opposing edge for edge 1, 2
+ * - 7, 8: opposing edge for edge 2, 0
+ * - 9, 10, 11: dominant vertices for corners 0, 1, 2
+ * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping.
+ * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*4 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
 /**
  * Vertex transform cache optimizer
  * Reorders indices to reduce the number of GPU vertex shader invocations
@@ -102,6 +144,15 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest
  */
 MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
 
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
 /**
  * Vertex transform cache optimizer for FIFO caches
  * Reorders indices to reduce the number of GPU vertex shader invocations
@@ -120,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
  */
 MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
@@ -149,6 +200,7 @@ MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destinat
 /**
  * Index buffer encoder
  * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
  * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
  * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
  *
@@ -157,6 +209,12 @@ MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destinat
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
 
+/**
+ * Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
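    A minimal encode-path sketch (caller code, not library code), assuming the index buffer has already been optimized for vertex cache and fetch:

        meshopt_encodeIndexVersion(1); /* opt in to format v1, decodable by meshoptimizer 0.14+ */

        std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(index_count, vertex_count));
        size_t ibuf_size = meshopt_encodeIndexBuffer(ibuf.data(), ibuf.size(), indices, index_count);
        ibuf.resize(ibuf_size); /* 0 can only happen if the buffer was undersized */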
+
 /**
  * Index buffer decoder
  * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
@@ -167,17 +225,45 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size
  */
 MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
 
+/**
+ * Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
 /**
  * Vertex buffer encoder
  * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
  * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
  * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
  *
  * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
  */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
 
+/**
+ * Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
+
 /**
  * Vertex buffer decoder
  * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
@@ -189,7 +275,63 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, si
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
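    A round-trip sketch under the same caller-side assumptions as above (an interleaved Vertex struct with zero-initialized padding):

        std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
        vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices.data(), vertex_count, sizeof(Vertex)));

        std::vector<Vertex> decoded(vertex_count);
        int rc = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, sizeof(Vertex), vbuf.data(), vbuf.size());
        assert(rc == 0); /* non-zero indicates malformed or truncated input */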
 
 /**
- * Experimental: Mesh simplifier
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+
+/**
+ * Vertex buffer filter encoders
+ * These functions can be used to encode data in a format that meshopt_decodeFilter can decode
+ *
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * Input data must contain 4 floats for every vector (count*4 total).
+ *
+ * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ * Input data must contain 4 floats for every quaternion (count*4 total).
+ *
+ * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
+ * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
+ * Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ */
+enum meshopt_EncodeExpMode
+{
+	/* When encoding exponents, use separate values for each component (maximum quality) */
+	meshopt_EncodeExpSeparate,
+	/* When encoding exponents, use shared value for all components of each vector (better compression) */
+	meshopt_EncodeExpSharedVector,
+	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
+	meshopt_EncodeExpSharedComponent,
+};
+
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
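    For instance, a sketch that packs unit normals into 4 bytes each with the octahedral filter; the normals array (count*4 floats) is an assumption of the example, and in a real pipeline the matching meshopt_decodeFilterOct call runs after meshopt_decodeVertexBuffer on the loading side:

        std::vector<unsigned char> packed(count * 4); /* stride 4: signed 8-bit X/Y plus Z/W bytes */
        meshopt_encodeFilterOct(packed.data(), count, /* stride */ 4, /* bits */ 8, normals);
        /* ... meshopt_encodeVertexBuffer(packed, ...), transmit, meshopt_decodeVertexBuffer ... */
        meshopt_decodeFilterOct(packed.data(), count, 4);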
+
+/**
+ * Simplification options
+ */
+enum
+{
+    /* Do not move vertices that are located on the topological border (vertices on triangle edges that don't have a paired triangle). Useful for simplifying portions of the larger mesh. */
+    meshopt_SimplifyLockBorder = 1 << 0,
+};
+
+/**
+ * Mesh simplifier
  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
  * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
@@ -197,23 +339,39 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte
  * The resulting index buffer references vertices from the original vertex buffer.
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
- * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
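    For reference, a caller-side sketch of the new signature (the positions array of tightly packed float3 values is an assumption of the example):

        float lod_error = 0.f;
        std::vector<unsigned int> lod(index_count); /* size for the worst case, not target_index_count */
        lod.resize(meshopt_simplify(lod.data(), indices, index_count,
            positions, vertex_count, sizeof(float) * 3,
            index_count / 2, /* target_error */ 0.01f, meshopt_SimplifyLockBorder, &lod_error));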
+
+/**
+ * Experimental: Mesh simplifier with attribute metric
+ * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
+ * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ *
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
+ * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error);
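    A hypothetical call that additionally weights a per-vertex float3 normal; the attribute layout and weight values below are illustrative only:

        const float attr_weights[3] = {1e-2f, 1e-2f, 1e-2f}; /* one weight per attribute scalar */
        float lod_error = 0.f;
        std::vector<unsigned int> lod(index_count);
        lod.resize(meshopt_simplifyWithAttributes(lod.data(), indices, index_count,
            positions, vertex_count, sizeof(float) * 3,
            normals, sizeof(float) * 3, attr_weights, 3,
            index_count / 2, 0.01f, /* options */ 0, &lod_error));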
 
 /**
  * Experimental: Mesh simplifier (sloppy)
- * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance
- * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
+ * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
  * Returns the number of indices after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
- * destination must contain enough space for the target index buffer
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
 
 /**
  * Experimental: Point cloud simplifier
@@ -222,10 +380,19 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
  * The resulting index buffer references vertices from the original vertex buffer.
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
- * destination must contain enough space for the target index buffer
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * destination must contain enough space for the target index buffer (target_vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+
+/**
+ * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
+ *
+ * Absolute error must be *divided* by the scaling factor before passing it to meshopt_simplify as target_error
+ * Relative error returned by meshopt_simplify via result_error must be *multiplied* by the scaling factor to get absolute error.
+ */
+MESHOPTIMIZER_API float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
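    A sketch of the conversion in both directions; absolute_error and lod_error are assumed caller-side variables:

        float scale = meshopt_simplifyScale(positions, vertex_count, sizeof(float) * 3);
        float target_error = absolute_error / scale;   /* feed this to meshopt_simplify */
        /* ... run meshopt_simplify with &lod_error ... */
        float absolute_lod_error = lod_error * scale;  /* back to mesh units */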
 
 /**
  * Mesh stripifier
@@ -277,7 +444,7 @@ struct meshopt_OverdrawStatistics
  * Returns overdraw statistics using a software rasterizer
  * Results may not match actual GPU performance
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
@@ -296,23 +463,32 @@ MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetc
 
 struct meshopt_Meshlet
 {
-	unsigned int vertices[64];
-	unsigned char indices[126][3];
-	unsigned char triangle_count;
-	unsigned char vertex_count;
+	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
+	unsigned int vertex_offset;
+	unsigned int triangle_offset;
+
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	unsigned int vertex_count;
+	unsigned int triangle_count;
 };
 
 /**
- * Experimental: Meshlet builder
+ * Meshlet builder
  * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
  * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
- * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
+ * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
  *
- * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
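    Putting the pieces together, a caller-side sketch using the worst-case sizes documented above (64 vertices / 124 triangles are illustrative limits, and Vertex is again an assumed caller-side struct with the position first):

        const size_t max_vertices = 64, max_triangles = 124;
        size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);

        std::vector<meshopt_Meshlet> meshlets(max_meshlets);
        std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
        std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

        size_t meshlet_count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
            indices, index_count, &vertices[0].px, vertex_count, sizeof(Vertex),
            max_vertices, max_triangles, /* cone_weight */ 0.25f);
        meshlets.resize(meshlet_count);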
 
 struct meshopt_Bounds
 {
@@ -331,13 +507,13 @@ struct meshopt_Bounds
 };
 
 /**
- * Experimental: Cluster bounds generator
+ * Cluster bounds generator
  * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
  *
  * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
  *   dot(view, cone_axis) >= cone_cutoff
  *
- * For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff:
+ * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
  *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
  *
  * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
@@ -346,30 +522,31 @@ struct meshopt_Bounds
  *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
  *
  * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
- * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable (for derivation see
+ * Real-Time Rendering 4th Edition, section 19.3).
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
- * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
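    As a sketch, the bounding-sphere form of the rejection test can be evaluated on the CPU like this (field names follow meshopt_Bounds; <math.h> assumed):

        /* returns true when the whole cluster faces away from the camera and can be skipped */
        static bool cluster_backfacing(const meshopt_Bounds& b, const float cam[3])
        {
            float dx = b.center[0] - cam[0], dy = b.center[1] - cam[1], dz = b.center[2] - cam[2];
            float dist = sqrtf(dx * dx + dy * dy + dz * dz);
            float d = dx * b.cone_axis[0] + dy * b.cone_axis[1] + dz * b.cone_axis[2];
            return d >= b.cone_cutoff * dist + b.radius;
        }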
 
 /**
- * Experimental: Spatial sorter
+ * Spatial sorter
  * Generates a remap table that can be used to reorder points for spatial locality.
  * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
  *
  * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
  * Experimental: Spatial sorter
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
- * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
@@ -379,7 +556,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
  * Note that all algorithms only allocate memory for temporary use.
  * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
  */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
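    A minimal sketch of hooking in a custom heap (function names are placeholders; <stdlib.h> assumed):

        static void* MESHOPTIMIZER_ALLOC_CALLCONV my_allocate(size_t size) { return malloc(size); }
        static void MESHOPTIMIZER_ALLOC_CALLCONV my_deallocate(void* ptr) { free(ptr); }

        /* call once at startup; all temporary allocations made by the library then go through these */
        meshopt_setAllocator(my_allocate, my_deallocate);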
 
 #ifdef __cplusplus
 } /* extern "C" */
@@ -402,19 +579,25 @@ inline int meshopt_quantizeUnorm(float v, int N);
 inline int meshopt_quantizeSnorm(float v, int N);
 
 /**
- * Quantize a float into half-precision floating point value
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
  * Representable magnitude range: [6e-5; 65504]
  * Maximum relative reconstruction error: 5e-4
  */
-inline unsigned short meshopt_quantizeHalf(float v);
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
 
 /**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
  * Assumes N is in a valid mantissa precision range, which is 1..23
  */
-inline float meshopt_quantizeFloat(float v, int N);
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif
 
 /**
@@ -437,8 +620,14 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
 inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
 template <typename T>
 inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
@@ -451,9 +640,15 @@ inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_siz
 template <typename T>
 inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
 template <typename T>
-inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
-inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
@@ -465,7 +660,9 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
@@ -495,50 +692,6 @@ inline int meshopt_quantizeSnorm(float v, int N)
 
 	return int(v * scale + round);
 }
-
-inline unsigned short meshopt_quantizeHalf(float v)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	int s = (ui >> 16) & 0x8000;
-	int em = ui & 0x7fffffff;
-
-	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
-	int h = (em - (112 << 23) + (1 << 12)) >> 13;
-
-	/* underflow: flush to zero; 113 encodes exponent -14 */
-	h = (em < (113 << 23)) ? 0 : h;
-
-	/* overflow: infinity; 143 encodes exponent 16 */
-	h = (em >= (143 << 23)) ? 0x7c00 : h;
-
-	/* NaN; note that we convert all types of NaN to qNaN */
-	h = (em > (255 << 23)) ? 0x7e00 : h;
-
-	return (unsigned short)(s | h);
-}
-
-inline float meshopt_quantizeFloat(float v, int N)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	const int mask = (1 << (23 - N)) - 1;
-	const int round = (1 << (23 - N)) >> 1;
-
-	int e = ui & 0x7f800000;
-	unsigned int rui = (ui + round) & ~mask;
-
-	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
-	ui = e == 0x7f800000 ? ui : rui;
-
-	/* flush denormals to zero */
-	ui = e == 0 ? 0 : ui;
-
-	u.ui = ui;
-	return u.f;
-}
 #endif
 
 /* Internal implementation helpers */
@@ -549,8 +702,8 @@ public:
 	template <typename T>
 	struct StorageT
 	{
-		static void* (*allocate)(size_t);
-		static void (*deallocate)(void*);
+		static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
+		static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
 	};
 
 	typedef StorageT<void> Storage;
@@ -575,14 +728,21 @@ public:
 		return result;
 	}
 
+	void deallocate(void* ptr)
+	{
+		assert(count > 0 && blocks[count - 1] == ptr);
+		Storage::deallocate(ptr);
+		count--;
+	}
+
 private:
-	void* blocks[16];
+	void* blocks[24];
 	size_t count;
 };
 
 // This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif
 
 /* Inline implementation for C++ templated wrappers */
@@ -599,7 +759,7 @@ struct meshopt_IndexAdapter<T, false>
 
 	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
 	    : result(result_)
-	    , data(0)
+	    , data(NULL)
 	    , count(count_)
 	{
 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
@@ -639,33 +799,33 @@ struct meshopt_IndexAdapter<T, true>
 template <typename T>
 inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
 
-	return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+	return meshopt_generateVertexRemap(destination, indices ? in.data : NULL, index_count, vertices, vertex_count, vertex_size);
 }
 
 template <typename T>
 inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
 
-	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
 }
 
 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
 	meshopt_IndexAdapter<T> out(destination, 0, index_count);
 
-	meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
+	meshopt_remapIndexBuffer(out.data, indices ? in.data : NULL, index_count, remap);
 }
 
 template <typename T>
 inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
 }
@@ -673,26 +833,53 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
 }
 
+template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 2);
+
+	meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 4);
+
+	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
 template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
 }
 
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
+}
+
 template <typename T>
 inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
 }
@@ -700,8 +887,8 @@ inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, si
 template <typename T>
 inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
 }
@@ -709,7 +896,7 @@ inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t in
 template <typename T>
 inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
 }
@@ -725,7 +912,7 @@ inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t
 template <typename T>
 inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
 }
@@ -740,28 +927,54 @@ inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const u
 }
 
 template <typename T>
-inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
 
-	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
+	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
 }
 
 template <typename T>
-inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
-	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, options, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
 }
 
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count / 3) * 5);
 
 	return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
 }
@@ -769,8 +982,8 @@ inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_co
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count - 2) * 3);
 
 	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
 }
@@ -778,7 +991,7 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
 template <typename T>
 inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
 }
@@ -786,7 +999,7 @@ inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices
 template <typename T>
 inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -794,23 +1007,31 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
 }
 
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
-	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }
 
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -818,15 +1039,15 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
 	meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 #endif
 
 /**
- * Copyright (c) 2016-2019 Arseny Kapoulkine
+ * Copyright (c) 2016-2023 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation

+ 1 - 1
ThirdParty/MeshOptimizer/overdrawanalyzer.cpp

@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;

+ 1 - 1
ThirdParty/MeshOptimizer/overdrawoptimizer.cpp

@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;

+ 70 - 0
ThirdParty/MeshOptimizer/quantization.cpp

@@ -0,0 +1,70 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+
+unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000;
+	int em = ui & 0x7fffffff;
+
+	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	// underflow: flush to zero; 113 encodes exponent -14
+	h = (em < (113 << 23)) ? 0 : h;
+
+	// overflow: infinity; 143 encodes exponent 16
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	// NaN; note that we convert all types of NaN to qNaN
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N)
+{
+	assert(N >= 0 && N <= 23);
+
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1;
+	const int round = (1 << (23 - N)) >> 1;
+
+	int e = ui & 0x7f800000;
+	unsigned int rui = (ui + round) & ~mask;
+
+	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
+	ui = e == 0x7f800000 ? ui : rui;
+
+	// flush denormals to zero
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}
+
+float meshopt_dequantizeHalf(unsigned short h)
+{
+	unsigned int s = unsigned(h & 0x8000) << 16;
+	int em = h & 0x7fff;
+
+	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+	int r = (em + (112 << 10)) << 13;
+
+	// denormal: flush to zero
+	r = (em < (1 << 10)) ? 0 : r;
+
+	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+	r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+	union { float f; unsigned int ui; } u;
+	u.ui = s | r;
+	return u.f;
+}

File diff suppressed because it is too large
+ 477 - 204
ThirdParty/MeshOptimizer/simplifier.cpp


+ 2 - 2
ThirdParty/MeshOptimizer/spatialorder.cpp

@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
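Note: the tightened asserts above (and in the overdraw passes earlier in this diff) encode the fact that vertex_positions must point at three consecutive floats per vertex, so the smallest legal stride is 3 * sizeof(float) = 12 bytes. A sketch of a minimal conforming layout (struct and variable names are illustrative):

	struct PosVertex { float x, y, z; }; // 12 bytes: the smallest stride the asserts accept
	// meshopt_spatialSortRemap(remap, &positions[0].x, vertex_count, sizeof(PosVertex));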

+ 1 - 0
ThirdParty/MeshOptimizer/stripifier.cpp

@@ -1,3 +1,4 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
 #include <assert.h>

+ 55 - 36
ThirdParty/MeshOptimizer/vcacheoptimizer.cpp

@@ -13,13 +13,23 @@ namespace meshopt
 const size_t kCacheSizeMax = 16;
 const size_t kValenceMax = 8;
 
-static const float kVertexScoreTableCache[1 + kCacheSizeMax] = {
-    0.f,
-    0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f};
+struct VertexScoreTable
+{
+	float cache[1 + kCacheSizeMax];
+	float live[1 + kValenceMax];
+};
 
-static const float kVertexScoreTableLive[1 + kValenceMax] = {
-    0.f,
-    0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f};
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
+    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
+    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
+};
 
 struct TriangleAdjacency
 {
@@ -100,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned
 	return ~0u;
 }
 
-static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
 {
 	unsigned int best_candidate = ~0u;
 	int best_priority = -1;
@@ -131,13 +141,13 @@ static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_b
 	return best_candidate;
 }
 
-static float vertexScore(int cache_position, unsigned int live_triangles)
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
 {
 	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
 
 	unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
 
-	return kVertexScoreTableCache[1 + cache_position] + kVertexScoreTableLive[live_triangles_clamped];
+	return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
 }
 
 static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
@@ -156,7 +166,7 @@ static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const uns
 
 } // namespace meshopt
 
-void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
 {
 	using namespace meshopt;
 
@@ -197,7 +207,7 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 	float* vertex_scores = allocator.allocate<float>(vertex_count);
 
 	for (size_t i = 0; i < vertex_count; ++i)
-		vertex_scores[i] = vertexScore(-1, live_triangles[i]);
+		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
 
 	// compute triangle scores
 	float* triangle_scores = allocator.allocate<float>(face_count);
@@ -211,9 +221,9 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
 	}
 
-	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
 	unsigned int* cache = cache_holder;
-	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
 	size_t cache_count = 0;
 
 	unsigned int current_triangle = 0;
@@ -250,10 +260,8 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 		{
 			unsigned int index = cache[i];
 
-			if (index != a && index != b && index != c)
-			{
-				cache_new[cache_write++] = index;
-			}
+			cache_new[cache_write] = index;
+			cache_write += (index != a && index != b && index != c);
 		}
 
 		unsigned int* cache_temp = cache;
@@ -271,16 +279,16 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 		{
 			unsigned int index = indices[current_triangle * 3 + k];
 
-			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
-			size_t neighbours_size = adjacency.counts[index];
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];
 
-			for (size_t i = 0; i < neighbours_size; ++i)
+			for (size_t i = 0; i < neighbors_size; ++i)
 			{
-				unsigned int tri = neighbours[i];
+				unsigned int tri = neighbors[i];
 
 				if (tri == current_triangle)
 				{
-					neighbours[i] = neighbours[neighbours_size - 1];
+					neighbors[i] = neighbors[neighbors_size - 1];
 					adjacency.counts[index]--;
 					break;
 				}
@@ -295,19 +303,23 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 		{
 			unsigned int index = cache[i];
 
+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
 			int cache_position = i >= cache_size ? -1 : int(i);
 
 			// update vertex score
-			float score = vertexScore(cache_position, live_triangles[index]);
+			float score = vertexScore(table, cache_position, live_triangles[index]);
 			float score_diff = score - vertex_scores[index];
 
 			vertex_scores[index] = score;
 
 			// update scores of vertex triangles
-			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
-			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
 
-			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
 			{
 				unsigned int tri = *it;
 				assert(!emitted_flags[tri]);
@@ -315,11 +327,8 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 				float tri_score = triangle_scores[tri] + score_diff;
 				assert(tri_score > 0);
 
-				if (best_score < tri_score)
-				{
-					best_triangle = tri;
-					best_score = tri_score;
-				}
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;
 
 				triangle_scores[tri] = tri_score;
 			}
@@ -338,6 +347,16 @@ void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int*
 	assert(output_triangle == face_count);
 }
 
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
+}
+
 void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
 {
 	using namespace meshopt;
@@ -392,11 +411,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 	{
 		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
 
-		// emit all vertex neighbours
-		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
-		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+		// emit all vertex neighbors
+		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
 
-		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
 		{
 			unsigned int triangle = *it;
 
@@ -441,7 +460,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
 
 		// get next vertex
-		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
 
 		if (current_vertex == ~0u)
 		{
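Note: a minimal sketch of choosing between the two score tables exposed by this change (not part of the diff; the index/vertex variables are placeholders describing a triangle-list mesh, and out_indices is a separate output buffer of index_count elements):

	// tuned for GPU vertex cache hit rate (ACMR)
	meshopt_optimizeVertexCache(out_indices, indices, index_count, vertex_count);

	// tuned for smaller encoded index buffers / triangle strips
	meshopt_optimizeVertexCacheStrip(out_indices, indices, index_count, vertex_count);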

+ 455 - 161
ThirdParty/MeshOptimizer/vertexcodec.cpp

@@ -4,28 +4,80 @@
 #include <assert.h>
 #include <string.h>
 
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define SIMD_NEON
-#endif
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
 
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
 #if defined(__AVX__) || defined(__SSSE3__)
 #define SIMD_SSE
 #endif
 
-#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
 #define SIMD_SSE
 #define SIMD_FALLBACK
-#include <intrin.h> // __cpuid
+#define SIMD_TARGET __attribute__((target("ssse3")))
 #endif
 
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
 #if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
 #define SIMD_NEON
 #endif
 
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
 #ifdef SIMD_SSE
 #include <tmmintrin.h>
 #endif
 
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
 #ifdef SIMD_NEON
 #if defined(_MSC_VER) && defined(_M_ARM64)
 #include <arm64_neon.h>
@@ -34,12 +86,18 @@
 #endif
 #endif
 
-#ifndef TRACE
-#define TRACE 0
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
 #endif
 
-#if TRACE
-#include <stdio.h>
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
 #endif
 
 namespace meshopt
@@ -47,9 +105,12 @@ namespace meshopt
 
 const unsigned char kVertexHeader = 0xa0;
 
+static int gEncodeVertexVersion = 0;
+
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 const size_t kByteGroupSize = 16;
+const size_t kByteGroupDecodeLimit = 24;
 const size_t kTailMaxSize = 32;
 
 static size_t getVertexBlockSize(size_t vertex_size)
@@ -74,19 +135,6 @@ inline unsigned char unzigzag8(unsigned char v)
 	return -(v & 1) ^ (v >> 1);
 }
 
-#if TRACE
-struct Stats
-{
-	size_t size;
-	size_t header;
-	size_t bitg[4];
-	size_t bitb[4];
-};
-
-Stats* bytestats;
-Stats vertexstats[256];
-#endif
-
 static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
 	for (size_t i = 0; i < kByteGroupSize; ++i)
@@ -172,7 +220,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 
 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;
 
 	data += header_size;
 
@@ -180,8 +228,8 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
-		if (size_t(data_end - data) < kTailMaxSize)
-			return 0;
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
 
 		int best_bits = 8;
 		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
@@ -208,17 +256,8 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 
 		assert(data + best_size == next);
 		data = next;
-
-#if TRACE > 1
-		bytestats->bitg[bitslog2]++;
-		bytestats->bitb[bitslog2] += best_size;
-#endif
 	}
 
-#if TRACE > 1
-	bytestats->header += header_size;
-#endif
-
 	return data;
 }
 
@@ -247,19 +286,9 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 			vertex_offset += vertex_size;
 		}
 
-#if TRACE
-		const unsigned char* olddata = data;
-		bytestats = &vertexstats[k];
-#endif
-
 		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
 		if (!data)
-			return 0;
-
-#if TRACE
-		bytestats = 0;
-		vertexstats[k].size += data - olddata;
-#endif
+			return NULL;
 	}
 
 	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -267,7 +296,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 	return data;
 }
 
-#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
 static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 #define READ() byte = *data++
@@ -327,14 +356,14 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 
 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;
 
 	data += header_size;
 
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
-		if (size_t(data_end - data) < kTailMaxSize)
-			return 0;
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
 
@@ -359,7 +388,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 	{
 		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
 		if (!data)
-			return 0;
+			return NULL;
 
 		size_t vertex_offset = k;
 
@@ -384,11 +413,15 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 }
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 static unsigned char kDecodeBytesGroupShuffle[256][8];
 static unsigned char kDecodeBytesGroupCount[256];
 
-static bool decodeBytesGroupBuildTables()
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
 {
 	for (int mask = 0; mask < 256; ++mask)
 	{
@@ -413,6 +446,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
 #endif
 
 #ifdef SIMD_SSE
+SIMD_TARGET
 static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 {
 	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
@@ -424,27 +458,7 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 	return _mm_unpacklo_epi64(sm0, sm1r);
 }
 
-static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
-{
-	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
-	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
-	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
-	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
-
-	x0 = _mm_unpacklo_epi16(t0, t2);
-	x1 = _mm_unpackhi_epi16(t0, t2);
-	x2 = _mm_unpacklo_epi16(t1, t3);
-	x3 = _mm_unpackhi_epi16(t1, t3);
-}
-
-static __m128i unzigzag8(__m128i v)
-{
-	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
-	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
-
-	return _mm_xor_si128(xl, xr);
-}
-
+SIMD_TARGET
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 	switch (bitslog2)
@@ -466,6 +480,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
 
@@ -484,11 +510,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
 
@@ -506,14 +546,75 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
 	{
-		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+    _mm_set1_epi8(3),
+    _mm_set1_epi8(15),
+    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
+    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2);
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
-		__m128i result = rest;
+		return skip + _mm_popcnt_u32(mask16);
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
@@ -541,46 +642,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
-
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
-}
-
-static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
-{
-	uint8x16x2_t t01 = vzipq_u8(x0, x1);
-	uint8x16x2_t t23 = vzipq_u8(x2, x3);
-
-	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
-	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
-
-	x0 = vreinterpretq_u8_u16(x01.val[0]);
-	x1 = vreinterpretq_u8_u16(x01.val[1]);
-	x2 = vreinterpretq_u8_u16(x23.val[0]);
-	x3 = vreinterpretq_u8_u16(x23.val[1]);
-}
-
-static uint8x16_t unzigzag8(uint8x16_t v)
-{
-	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
-	uint8x16_t xr = vshrq_n_u8(v, 1);
-
-	return veorq_u8(xl, xr);
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
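Note: a scalar sketch of the multiply-based movemask used in neonMoveMask above (not part of the diff). Assuming every byte of the 64-bit half is either 0x00 or 0xff, multiplying by the magic constant gathers one bit per byte into the top byte:

	unsigned char movemask8(unsigned long long groups) // groups: 8 bytes, each 0x00 or 0xff
	{
		const unsigned long long magic = 0x000103070f1f3f80ull;
		return (unsigned char)((groups * magic) >> 56);
	}
	// e.g. bytes {ff,00,ff,00,00,00,00,ff} (LSB first) -> 0x85 (bits 0, 2 and 7 set)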
@@ -598,6 +666,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -614,11 +694,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -634,14 +728,16 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
 	{
-		uint8x16_t rest = vld1q_u8(data);
-
-		uint8x16_t result = rest;
+		uint8x16_t result = vld1q_u8(data);
 
 		vst1q_u8(buffer, result);
 
@@ -655,7 +751,182 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 }
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
+	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
+
+	return wasmx_unpacklo_v64x2(sm0, sm1r);
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
+
+	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
+	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0);
+
+		wasm_v128_store(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		v128_t sel2 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 4);
+
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		v128_t sel4 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 8);
+
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
+
+	x0 = _mm_unpacklo_epi16(t0, t2);
+	x1 = _mm_unpackhi_epi16(t0, t2);
+	x2 = _mm_unpacklo_epi16(t1, t3);
+	x3 = _mm_unpackhi_epi16(t1, t3);
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{
+	uint8x16x2_t t01 = vzipq_u8(x0, x1);
+	uint8x16x2_t t23 = vzipq_u8(x2, x3);
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]);
+	x1 = vreinterpretq_u8_u16(x01.val[1]);
+	x2 = vreinterpretq_u8_u16(x23.val[0]);
+	x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
+	uint8x16_t xr = vshrq_n_u8(v, 1);
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2);
+	x1 = wasmx_unpackhi_v16x8(t0, t2);
+	x2 = wasmx_unpacklo_v16x8(t1, t3);
+	x3 = wasmx_unpackhi_v16x8(t1, t3);
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
+	v128_t xr = wasm_u8x16_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
 static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
 {
 	assert(buffer_size % kByteGroupSize == 0);
@@ -667,14 +938,14 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 
 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;
 
 	data += header_size;
 
 	size_t i = 0;
 
-	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=32b
-	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kTailMaxSize * 4; i += kByteGroupSize * 4)
+	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
 	{
 		size_t header_offset = i / kByteGroupSize;
 		unsigned char header_byte = header[header_offset / 4];
@@ -688,8 +959,8 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	// slow-path: process remaining groups
 	for (; i < buffer_size; i += kByteGroupSize)
 	{
-		if (size_t(data_end - data) < kTailMaxSize)
-			return 0;
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
 
@@ -701,6 +972,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	return data;
 }
 
+SIMD_TARGET
 static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
 {
 	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
@@ -716,10 +988,10 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 		{
 			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
 			if (!data)
-				return 0;
+				return NULL;
 		}
 
-#ifdef SIMD_SSE
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
 #define TEMP __m128i
 #define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
 #define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
@@ -735,6 +1007,15 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 #define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 #define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
 #define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
 #endif
 
 		PREP();
@@ -790,6 +1071,21 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 }
 #endif
 
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{
+	int cpuinfo[4] = {};
+#ifdef _MSC_VER
+	__cpuid(cpuinfo, 1);
+#else
+	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+#endif
+	return cpuinfo[2];
+}
+
+static unsigned int cpuid = getCpuFeatures();
+#endif
+
 } // namespace meshopt
 
 size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -799,10 +1095,6 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-#if TRACE
-	memset(vertexstats, 0, sizeof(vertexstats));
-#endif
-
 	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 
 	unsigned char* data = buffer;
@@ -811,11 +1103,16 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	if (size_t(data_end - data) < 1 + vertex_size)
 		return 0;
 
-	*data++ = kVertexHeader;
+	int version = gEncodeVertexVersion;
 
-	unsigned char last_vertex[256] = {};
+	*data++ = (unsigned char)(kVertexHeader | version);
+
+	unsigned char first_vertex[256] = {};
 	if (vertex_count > 0)
-		memcpy(last_vertex, vertex_data, vertex_size);
+		memcpy(first_vertex, vertex_data, vertex_size);
+
+	unsigned char last_vertex[256] = {};
+	memcpy(last_vertex, first_vertex, vertex_size);
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
@@ -844,34 +1141,12 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 		data += kTailMaxSize - vertex_size;
 	}
 
-	memcpy(data, vertex_data, vertex_size);
+	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
-#if TRACE
-	size_t total_size = data - buffer;
-
-	for (size_t k = 0; k < vertex_size; ++k)
-	{
-		const Stats& vsk = vertexstats[k];
-
-		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
-
-#if TRACE > 1
-		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
-		       int(vsk.header),
-		       int(vsk.bitg[0]), int(vsk.bitb[0]),
-		       int(vsk.bitg[1]), int(vsk.bitb[1]),
-		       int(vsk.bitg[2]), int(vsk.bitb[2]),
-		       int(vsk.bitg[3]), int(vsk.bitb[3]));
-#endif
-
-		printf("\n");
-	}
-#endif
-
 	return data - buffer;
 }
 
@@ -893,6 +1168,13 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
 }
 
+void meshopt_encodeVertexVersion(int version)
+{
+	assert(unsigned(version) <= 0);
+
+	meshopt::gEncodeVertexVersion = version;
+}
+
 int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -900,20 +1182,19 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
-	int cpuinfo[4] = {};
-	__cpuid(cpuinfo, 1);
-	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
-#elif defined(SIMD_SSE) || defined(SIMD_NEON)
+	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	decode = decodeVertexBlockSimd;
 #else
 	decode = decodeVertexBlock;
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	assert(gDecodeBytesGroupInitialized);
+	(void)gDecodeBytesGroupInitialized;
 #endif
 
 	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
@@ -924,7 +1205,13 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	if (size_t(data_end - data) < 1 + vertex_size)
 		return -2;
 
-	if (*data++ != kVertexHeader)
+	unsigned char data_header = *data++;
+
+	if ((data_header & 0xf0) != kVertexHeader)
+		return -1;
+
+	int version = data_header & 0x0f;
+	if (version > 0)
 		return -1;
 
 	unsigned char last_vertex[256];
@@ -952,3 +1239,10 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 
 	return 0;
 }
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
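Note: a minimal encode/decode round trip for the versioned vertex codec (not part of the diff; Vertex is an assumed 4-byte-aligned struct and 'vertices'/'vertex_count' are placeholders):

	meshopt_encodeVertexVersion(0); // version 0 is the only format this decoder accepts

	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
	encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices, vertex_count, sizeof(Vertex)));

	std::vector<Vertex> decoded(vertex_count);
	int rc = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, sizeof(Vertex), encoded.data(), encoded.size());
	// rc == 0 on success; -1 for an unsupported header/version byte, -2 for malformed data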

+ 1033 - 0
ThirdParty/MeshOptimizer/vertexfilter.cpp

@@ -0,0 +1,1033 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#undef __DEPRECATED
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{
+	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+		// fixup octahedral coordinates for z<0
+		float t = (z >= 0.f) ? 0.f : z;
+
+		x += (x >= 0.f) ? t : -t;
+		y += (y >= 0.f) ? t : -t;
+
+		// compute normal length & scale
+		float l = sqrtf(x * x + y * y + z * z);
+		float s = max / l;
+
+		// rounded signed float->int
+		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+		data[i * 4 + 0] = T(xf);
+		data[i * 4 + 1] = T(yf);
+		data[i * 4 + 2] = T(zf);
+	}
+}
+
+static void decodeFilterQuat(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from the high byte of the component
+		int sf = data[i * 4 + 3] | 3;
+		float ss = scale / float(sf);
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float x = float(data[i * 4 + 0]) * ss;
+		float y = float(data[i * 4 + 1]) * ss;
+		float z = float(data[i * 4 + 2]) * ss;
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float ww = 1.f - x * x - y * y - z * z;
+		float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+		// rounded signed float->int
+		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * 32767.f + 0.5f);
+
+		int qc = data[i * 4 + 3] & 3;
+
+		// output order is dictated by input index
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
+	}
+}
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa and exponent
+		int m = int(v << 8) >> 8;
+		int e = int(v) >> 24;
+
+		union
+		{
+			float f;
+			unsigned int ui;
+		} u;
+
+		// optimized version of ldexp(float(m), e)
+		u.ui = unsigned(e + 127) << 23;
+		u.f = u.f * float(m);
+
+		data[i] = u.ui;
+	}
+}
+#endif
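Note: decodeFilterExp above expands each 32-bit word as "signed 24-bit mantissa m, signed 8-bit exponent e, value = m * 2^e". A hypothetical packing helper (not the library's encoder) that produces this layout while keeping roughly 'bits' bits of mantissa:

	// illustrative only; assumes |v| stays within the exponent range of the format
	unsigned int packFilterExp(float v, int bits)
	{
		int e = 0;
		frexpf(v, &e);              // v = f * 2^e with f in [0.5, 1)
		e -= bits;                  // leave 'bits' bits of mantissa
		int m = int(ldexpf(v, -e)); // v ~= m * 2^e
		return (unsigned(e) << 24) | (unsigned(m) & 0xffffff);
	}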
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+template <typename T>
+static void dispatchSimd(void (*process)(T*, size_t), T* data, size_t count, size_t stride)
+{
+	assert(stride <= 4);
+
+	size_t count4 = count & ~size_t(3);
+	process(data, count4);
+
+	if (count4 < count)
+	{
+		T tail[4 * 4] = {}; // max stride 4, max count 4
+		size_t tail_size = (count - count4) * stride * sizeof(T);
+		assert(tail_size <= sizeof(tail));
+
+		memcpy(tail, data + count4 * stride, tail_size);
+		process(tail, count - count4);
+		memcpy(data + count4 * stride, tail, tail_size);
+	}
+}
+
+inline uint64_t rotateleft64(uint64_t v, int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63));
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f);
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24);
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+	float32x4_t r = vrsqrteq_f32(x);
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+	return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+	float32x4_t r = vrecpeq_f32(y);
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+	return vmulq_f32(x, r);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
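
The fsnap constant used above implements a common float-to-integer rounding trick; a scalar sketch follows (illustrative only, assuming the default round-to-nearest mode and a value small enough to fit in 16 bits):

// Adding 3*2^22 (0x4B400000 as a float) renormalizes x so that its rounded integer part lands in the
// low mantissa bits; the low 16 bits of the bit pattern then hold x mod 2^16 in two's complement, which
// is why the 0x4B40_0000 offset never needs to be subtracted.
inline short snapRound(float x)
{
	union
	{
		float f;
		unsigned int u;
	} bits;

	bits.f = x + 12582912.f; // 3 << 22
	return short(bits.u & 0xffff);
}
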
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
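
For reference, the octahedral reconstruction performed by the two decoders above has a compact scalar form; this sketch (illustrative only) produces unit floats, whereas the kernels above additionally re-quantize the result in place:

// u, v are the stored snorm components; "one" is the stored z reference (127.f or 32767.f).
inline void decodeOctScalar(float u, float v, float one, float out[3])
{
	float z = one - fabsf(u) - fabsf(v);

	// fold the lower hemisphere back: when z < 0, the encoder reflected (u, v) across the diagonal
	float t = z < 0.f ? z : 0.f;
	float x = u + (u >= 0.f ? t : -t);
	float y = v + (v >= 0.f ? t : -t);

	float len = sqrtf(x * x + y * y + z * z);
	out[0] = x / len;
	out[1] = y / len;
	out[2] = z / len;
}
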
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z back to floats; the combined scale folds in the 1/sqrt(2) factor, so values land in [-1/sqrt(2)..1/sqrt(2)]
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate and store
+		uint64_t* out = (uint64_t*)&data[i * 4];
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		int32x4_t ef = vshrq_n_s32(v, 24);
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// combine xr/yr/zr into final value
+		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
+		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+		v128_t yf = wasm_i32x4_shr(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+		v128_t zf = wasm_v128_and(z4, zmask);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+		// patch in .w
+		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+		// convert x/y/z back to floats; the combined scale folds in the 1/sqrt(2) factor, so values land in [-1/sqrt(2)..1/sqrt(2)]
+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// note: i32x4_max with 0 is equivalent to f32x4_max
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+		v128_t s = wasm_f32x4_splat(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+		// compute component index shifted left by 4 (and moved into i32x4 slot)
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly
+		v128_t ef = wasm_i32x4_shr(v, 24);
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m);
+
+		wasm_v128_store(&data[i], r);
+	}
+}
+#endif
+
+// optimized variant of frexp
+inline int optlog2(float v)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.f = v;
+	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
+	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+}
+
+// optimized variant of ldexp
+inline float optexp2(int e)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.ui = unsigned(e + 127) << 23;
+	return u.f;
+}
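
A quick way to see what these two helpers compute (illustrative sketch; the values follow directly from the bit manipulation above, assuming optlog2/optexp2 and assert are in scope):

static void optlogexp_selfcheck()
{
	assert(optexp2(3) == 8.f);    // builds 2^3 directly from the exponent bits
	assert(optexp2(-2) == 0.25f); // negative exponents work as long as e + 127 stays positive
	assert(optlog2(6.f) == 3);    // 6 <= 2^3; the +1 accounts for the implicit mantissa bit
	assert(optlog2(8.f) == 4);    // exact powers of two report the next exponent up
}
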
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (stride == 4)
+		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
+	else
+		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
+#else
+	if (stride == 4)
+		decodeFilterOct(static_cast<signed char*>(buffer), count);
+	else
+		decodeFilterOct(static_cast<short*>(buffer), count);
+#endif
+}
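
A minimal usage sketch for the 4-byte path (illustrative; the function and variable names are hypothetical, and the buffer is assumed to hold data produced by meshopt_encodeFilterOct or by vertex decoding of such data):

// Decodes count octahedral-encoded normals of 4 bytes each in place, leaving 8-bit snorm unit vectors
// in x/y/z and preserving the w byte.
static void unpackNormals8(signed char* packed_normals, size_t count)
{
	meshopt_decodeFilterOct(packed_normals, count, /* stride */ 4);
}
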
+
+void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 8);
+	(void)stride;
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), count, 4);
+#else
+	decodeFilterQuat(static_cast<short*>(buffer), count);
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int*>(buffer), count * (stride / 4), 1);
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), count * (stride / 4));
+#endif
+}
+
+void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 1 && bits <= 16);
+
+	signed char* d8 = static_cast<signed char*>(destination);
+	short* d16 = static_cast<short*>(destination);
+
+	int bytebits = int(stride * 2);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* n = &data[i * 4];
+
+		// octahedral encoding of a unit vector
+		float nx = n[0], ny = n[1], nz = n[2], nw = n[3];
+		float nl = fabsf(nx) + fabsf(ny) + fabsf(nz);
+		float ns = nl == 0.f ? 0.f : 1.f / nl;
+
+		nx *= ns;
+		ny *= ns;
+
+		float u = (nz >= 0.f) ? nx : (1 - fabsf(ny)) * (nx >= 0.f ? 1.f : -1.f);
+		float v = (nz >= 0.f) ? ny : (1 - fabsf(nx)) * (ny >= 0.f ? 1.f : -1.f);
+
+		int fu = meshopt_quantizeSnorm(u, bits);
+		int fv = meshopt_quantizeSnorm(v, bits);
+		int fo = meshopt_quantizeSnorm(1.f, bits);
+		int fw = meshopt_quantizeSnorm(nw, bytebits);
+
+		if (stride == 4)
+		{
+			d8[i * 4 + 0] = (signed char)(fu);
+			d8[i * 4 + 1] = (signed char)(fv);
+			d8[i * 4 + 2] = (signed char)(fo);
+			d8[i * 4 + 3] = (signed char)(fw);
+		}
+		else
+		{
+			d16[i * 4 + 0] = short(fu);
+			d16[i * 4 + 1] = short(fv);
+			d16[i * 4 + 2] = short(fo);
+			d16[i * 4 + 3] = short(fw);
+		}
+	}
+}
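
A matching usage sketch for the encoder (illustrative; names are hypothetical). Per the assertions above, the input must supply 4 floats per vector and the stride selects 8- or 16-bit output:

// Packs count unit normals (x, y, z, w as floats) into 4 bytes each, quantizing u/v to 8 bits.
static void packNormals8(const float* unit_normals, size_t count, signed char* packed)
{
	meshopt_encodeFilterOct(packed, count, /* stride */ 4, /* bits */ 8, unit_normals);
}
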
+
+void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 8);
+	assert(bits >= 4 && bits <= 16);
+	(void)stride;
+
+	short* destination = static_cast<short*>(destination_);
+
+	const float scaler = sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* q = &data[i * 4];
+		short* d = &destination[i * 4];
+
+		// establish maximum quaternion component
+		int qc = 0;
+		qc = fabsf(q[1]) > fabsf(q[qc]) ? 1 : qc;
+		qc = fabsf(q[2]) > fabsf(q[qc]) ? 2 : qc;
+		qc = fabsf(q[3]) > fabsf(q[qc]) ? 3 : qc;
+
+		// we use double-cover properties to discard the sign
+		float sign = q[qc] < 0.f ? -1.f : 1.f;
+
+		// note: we always encode a cyclical swizzle to be able to recover the order via rotation
+		d[0] = short(meshopt_quantizeSnorm(q[(qc + 1) & 3] * scaler * sign, bits));
+		d[1] = short(meshopt_quantizeSnorm(q[(qc + 2) & 3] * scaler * sign, bits));
+		d[2] = short(meshopt_quantizeSnorm(q[(qc + 3) & 3] * scaler * sign, bits));
+		d[3] = short((meshopt_quantizeSnorm(1.f, bits) & ~3) | qc);
+	}
+}
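
For reference, the scheme above (largest component dropped, remaining three stored in cyclic order, component index kept in the low 2 bits of the last value) can be inverted with a few scalar operations. This sketch (illustrative only) reconstructs floats, while the library's decoder re-quantizes the result to 16-bit snorm as the SIMD kernels earlier show:

inline void decodeQuatScalar(const short d[4], float out[4])
{
	int qc = d[3] & 3;             // index of the dropped (largest-magnitude) component
	float range = float(d[3] | 3); // max snorm value used by the encoder, low bits restored to 1s
	float scale = 1.f / (sqrtf(2.f) * range);

	float x = d[0] * scale;
	float y = d[1] * scale;
	float z = d[2] * scale;

	float ww = 1.f - x * x - y * y - z * z;
	float w = sqrtf(ww < 0.f ? 0.f : ww); // clamp to avoid NaN from rounding error

	// undo the cyclic swizzle so the reconstructed component lands back at index qc
	out[(qc + 1) & 3] = x;
	out[(qc + 2) & 3] = y;
	out[(qc + 3) & 3] = z;
	out[qc] = w;
}
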
+
+void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
+{
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0 && stride <= 256);
+	assert(bits >= 1 && bits <= 24);
+
+	unsigned int* destination = static_cast<unsigned int*>(destination_);
+	size_t stride_float = stride / sizeof(float);
+
+	int component_exp[64];
+	assert(stride_float <= sizeof(component_exp) / sizeof(int));
+
+	const int min_exp = -100;
+
+	if (mode == meshopt_EncodeExpSharedComponent)
+	{
+		for (size_t j = 0; j < stride_float; ++j)
+			component_exp[j] = min_exp;
+
+		for (size_t i = 0; i < count; ++i)
+		{
+			const float* v = &data[i * stride_float];
+
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
+			}
+		}
+	}
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* v = &data[i * stride_float];
+		unsigned int* d = &destination[i * stride_float];
+
+		int vector_exp = min_exp;
+
+		if (mode == meshopt_EncodeExpSharedVector)
+		{
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				vector_exp = (vector_exp < e) ? e : vector_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpSeparate)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (min_exp < e) ? e : min_exp;
+			}
+		}
+
+		for (size_t j = 0; j < stride_float; ++j)
+		{
+			int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
+
+			// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
+			exp -= (bits - 1);
+
+			// compute renormalized rounded mantissa for each component
+			int mmask = (1 << 24) - 1;
+
+			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
+
+			d[j] = (m & mmask) | (unsigned(exp) << 24);
+		}
+	}
+}
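
A minimal usage sketch (illustrative; names and sizes are hypothetical). Positions stored as 3 floats per vertex can be packed with one shared exponent per vector, which generally makes the data more compressible than raw floats:

static void packPositionsExp(const float* positions, size_t vertex_count, unsigned int* packed)
{
	// stride is in bytes (3 floats = 12); each float becomes one 32-bit exponent+mantissa word
	meshopt_encodeFilterExp(packed, vertex_count, /* stride */ 12, /* bits */ 15, positions, meshopt_EncodeExpSharedVector);
}
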
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM

Some files were not shown because too many files changed in this diff