
Added MeshOptimizer interface to fmt lib.

TothBenoit 9 months ago
commit e5c012d1dc

+ 8 - 0
include/meshoptimizer/allocator.cpp

@@ -0,0 +1,8 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+{
+	meshopt_Allocator::Storage::allocate = allocate;
+	meshopt_Allocator::Storage::deallocate = deallocate;
+}
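
A minimal usage sketch for the allocator hook above, assuming the CRT malloc/free as the backing heap; the helper name is illustrative, and any pair of functions with matching signatures works:

#include <stdlib.h>

#include "meshoptimizer.h"

// forward the library's temporary allocations to the CRT heap;
// a real engine would plug in its own allocator here
static void* meshoptAlloc(size_t size) { return malloc(size); }
static void meshoptFree(void* ptr) { free(ptr); }

void installMeshoptAllocator()
{
	meshopt_setAllocator(&meshoptAlloc, &meshoptFree);
}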

+ 977 - 0
include/meshoptimizer/clusterizer.cpp

@@ -0,0 +1,977 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
+// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
+// Jack Ritter. An Efficient Bounding Sphere. 1990
+namespace meshopt
+{
+
+// This must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet
+const size_t kMeshletMaxVertices = 255;
+
+// A reasonable limit is around 2*max_vertices or less
+const size_t kMeshletMaxTriangles = 512;
+
+struct TriangleAdjacency2
+{
+	unsigned int* counts;
+	unsigned int* offsets;
+	unsigned int* data;
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
+static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+{
+	assert(count > 0);
+
+	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[3] = {0, 0, 0};
+	size_t pmax[3] = {0, 0, 0};
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
+			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+		}
+	}
+
+	// find the pair of points with largest distance
+	float paxisd2 = 0;
+	int paxis = 0;
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		const float* p1 = points[pmin[axis]];
+		const float* p2 = points[pmax[axis]];
+
+		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+
+		if (d2 > paxisd2)
+		{
+			paxisd2 = d2;
+			paxis = axis;
+		}
+	}
+
+	// use the longest segment as the initial sphere diameter
+	const float* p1 = points[pmin[paxis]];
+	const float* p2 = points[pmax[paxis]];
+
+	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
+	float radius = sqrtf(paxisd2) / 2;
+
+	// iteratively adjust the sphere up until all points fit
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+		if (d2 > radius * radius)
+		{
+			float d = sqrtf(d2);
+			assert(d > 0);
+
+			float k = 0.5f + (radius / d) / 2;
+
+			center[0] = center[0] * k + p[0] * (1 - k);
+			center[1] = center[1] * k + p[1] * (1 - k);
+			center[2] = center[2] * k + p[2] * (1 - k);
+			radius = (radius + d) / 2;
+		}
+	}
+
+	result[0] = center[0];
+	result[1] = center[1];
+	result[2] = center[2];
+	result[3] = radius;
+}
+
+struct Cone
+{
+	float px, py, pz;
+	float nx, ny, nz;
+};
+
+static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+{
+	float cone = 1.f - spread * cone_weight;
+	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
+
+	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+}
+
+static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
+{
+	Cone result = acc;
+
+	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);
+
+	result.px *= center_scale;
+	result.py *= center_scale;
+	result.pz *= center_scale;
+
+	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
+	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);
+
+	result.nx *= axis_scale;
+	result.ny *= axis_scale;
+	result.nz *= axis_scale;
+
+	return result;
+}
+
+static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	(void)vertex_count;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+	size_t face_count = index_count / 3;
+
+	float mesh_area = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+		float invarea = (area == 0.f) ? 0.f : 1.f / area;
+
+		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
+		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
+		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
+
+		triangles[i].nx = normalx * invarea;
+		triangles[i].ny = normaly * invarea;
+		triangles[i].nz = normalz * invarea;
+
+		mesh_area += area;
+	}
+
+	return mesh_area;
+}
+
+static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+{
+	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
+
+	// fill 4b padding with 0
+	while (offset & 3)
+		meshlet_triangles[offset++] = 0;
+}
+
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+{
+	unsigned char& av = used[a];
+	unsigned char& bv = used[b];
+	unsigned char& cv = used[c];
+
+	bool result = false;
+
+	int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	{
+		meshlets[meshlet_offset] = meshlet;
+
+		for (size_t j = 0; j < meshlet.vertex_count; ++j)
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
+
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlet.vertex_offset += meshlet.vertex_count;
+		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.vertex_count = 0;
+		meshlet.triangle_count = 0;
+
+		result = true;
+	}
+
+	if (av == 0xff)
+	{
+		av = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
+	}
+
+	if (bv == 0xff)
+	{
+		bv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
+	}
+
+	if (cv == 0xff)
+	{
+		cv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
+	}
+
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet.triangle_count++;
+
+	return result;
+}
+
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight)
+{
+	unsigned int best_triangle = ~0u;
+	int best_priority = 5;
+	float best_score = FLT_MAX;
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+			assert(extra <= 2);
+
+			int priority = -1;
+
+			// triangles that don't add new vertices to meshlets are max. priority
+			if (extra == 0)
+				priority = 0;
+			// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+			else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+				priority = 1;
+			// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
+			else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
+				priority = 1 + extra;
+			// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
+			else
+				priority = 2 + extra;
+
+			// since topology-based priority is always more important than the score, we can skip scoring in some cases
+			if (priority > best_priority)
+				continue;
+
+			float score = 0;
+
+			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
+			if (meshlet_cone)
+			{
+				const Cone& tri_cone = triangles[triangle];
+
+				float distance2 =
+				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
+				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
+				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
+
+				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
+
+				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
+			}
+			else
+			{
+				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
+				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
+			}
+
+			// note that topology-based priority is always more important than the score
+			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
+			if (priority < best_priority || score < best_score)
+			{
+				best_triangle = triangle;
+				best_priority = priority;
+				best_score = score;
+			}
+		}
+	}
+
+	return best_triangle;
+}
+
+struct KDNode
+{
+	union
+	{
+		float split;
+		unsigned int index;
+	};
+
+	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
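+	// e.g. a branch stored at nodes[i] with children = C keeps its left subtree in nodes[i+1 .. i+C] and its right subtree starting at nodes[i+1+C]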
+	unsigned int axis : 2;
+	unsigned int children : 30;
+};
+
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+{
+	size_t m = 0;
+
+	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+	for (size_t i = 0; i < count; ++i)
+	{
+		float v = points[indices[i] * stride + axis];
+
+		// swap(m, i) unconditionally
+		unsigned int t = indices[m];
+		indices[m] = indices[i];
+		indices[i] = t;
+
+		// when v >= pivot, we swap i with m without advancing it, preserving invariants
+		m += v < pivot;
+	}
+
+	return m;
+}
+
+static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
+{
+	assert(offset + count <= node_count);
+	(void)node_count;
+
+	KDNode& result = nodes[offset];
+
+	result.index = indices[0];
+	result.axis = 3;
+	result.children = unsigned(count - 1);
+
+	// all remaining points are stored in nodes immediately following the leaf
+	for (size_t i = 1; i < count; ++i)
+	{
+		KDNode& tail = nodes[offset + i];
+
+		tail.index = indices[i];
+		tail.axis = 3;
+		tail.children = ~0u >> 2; // bogus value to prevent misuse
+	}
+
+	return offset + count;
+}
+
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+{
+	assert(count > 0);
+	assert(offset < node_count);
+
+	if (count <= leaf_size)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	float mean[3] = {};
+	float vars[3] = {};
+	float runc = 1, runs = 1;
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = points + indices[i] * stride;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]);
+		}
+	}
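+	// note: vars[] ends up holding the per-axis sum of squared deviations (count * variance); since it is only compared across axes, normalization by count is unnecessary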
+
+	// split axis is one where the variance is largest
+	unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+
+	float split = mean[axis];
+	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
+
+	// when the partition is degenerate simply consolidate the points into a single node
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	KDNode& result = nodes[offset];
+
+	result.split = split;
+	result.axis = axis;
+
+	// left subtree is right after our node
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+
+	// distance to the right subtree is represented explicitly
+	result.children = unsigned(next_offset - offset - 1);
+
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+}
+
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+{
+	const KDNode& node = nodes[root];
+
+	if (node.axis == 3)
+	{
+		// leaf
+		for (unsigned int i = 0; i <= node.children; ++i)
+		{
+			unsigned int index = nodes[root + i].index;
+
+			if (emitted_flags[index])
+				continue;
+
+			const float* point = points + index * stride;
+
+			float distance2 =
+			    (point[0] - position[0]) * (point[0] - position[0]) +
+			    (point[1] - position[1]) * (point[1] - position[1]) +
+			    (point[2] - position[2]) * (point[2] - position[2]);
+			float distance = sqrtf(distance2);
+
+			if (distance < limit)
+			{
+				result = index;
+				limit = distance;
+			}
+		}
+	}
+	else
+	{
+		// branch; we order recursion to process the node that search position is in first
+		float delta = position[node.axis] - node.split;
+		unsigned int first = (delta <= 0) ? 0 : node.children;
+		unsigned int second = first ^ node.children;
+
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+
+		// only process the other node if it can have a match based on closest distance so far
+		if (fabsf(delta) <= limit)
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+	}
+}
+
+} // namespace meshopt
+
+size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	(void)kMeshletMaxVertices;
+	(void)kMeshletMaxTriangles;
+
+	// meshlet construction is limited by max vertices and max triangles per meshlet
+	// the worst case is that the input is an unindexed stream since this equally stresses both limits
+	// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
+	size_t max_vertices_conservative = max_vertices - 2;
+	size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
+	size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
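+	// worked example (illustrative): index_count = 36000 with max_vertices = 64 and max_triangles = 124 gives ceil(36000 / 62) = 581 and ceil(12000 / 124) = 97, so the bound is 581 meshlets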
+
+	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
+}
+
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	assert(cone_weight >= 0 && cone_weight <= 1);
+
+	meshopt_Allocator allocator;
+
+	TriangleAdjacency2 adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	size_t face_count = index_count / 3;
+
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// for each triangle, precompute centroid & normal to use for scoring
+	Cone* triangles = allocator.allocate<Cone>(face_count);
+	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
+	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
+	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
+
+	// build a kd-tree for nearest neighbor lookup
+	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
+	for (size_t i = 0; i < face_count; ++i)
+		kdindices[i] = unsigned(i);
+
+	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+
+	Cone meshlet_cone_acc = {};
+
+	for (;;)
+	{
+		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
+
+		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
+		int best_extra = best_triangle == ~0u ? -1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);
+
+		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
+		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+		{
+			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
+		}
+
+		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+		if (best_triangle == ~0u)
+		{
+			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
+			unsigned int index = ~0u;
+			float limit = FLT_MAX;
+
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+
+			best_triangle = index;
+		}
+
+		if (best_triangle == ~0u)
+			break;
+
+		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+		{
+			meshlet_offset++;
+			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
+		}
+
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
+		{
+			unsigned int index = indices[best_triangle * 3 + k];
+
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbors_size; ++i)
+			{
+				unsigned int tri = neighbors[i];
+
+				if (tri == best_triangle)
+				{
+					neighbors[i] = neighbors[neighbors_size - 1];
+					adjacency.counts[index]--;
+					break;
+				}
+			}
+		}
+
+		// update aggregated meshlet cone data for scoring subsequent triangles
+		meshlet_cone_acc.px += triangles[best_triangle].px;
+		meshlet_cone_acc.py += triangles[best_triangle].py;
+		meshlet_cone_acc.pz += triangles[best_triangle].pz;
+		meshlet_cone_acc.nx += triangles[best_triangle].nx;
+		meshlet_cone_acc.ny += triangles[best_triangle].ny;
+		meshlet_cone_acc.nz += triangles[best_triangle].nz;
+
+		emitted_flags[best_triangle] = 1;
+	}
+
+	if (meshlet.triangle_count)
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	meshopt_Allocator allocator;
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
+	}
+
+	if (meshlet.triangle_count)
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
+}
+
+meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(index_count / 3 <= kMeshletMaxTriangles);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	(void)vertex_count;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	// compute triangle normals and gather triangle corners
+	float normals[kMeshletMaxTriangles][3];
+	float corners[kMeshletMaxTriangles][3][3];
+	size_t triangles = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+		// no need to include degenerate triangles - they will be invisible anyway
+		if (area == 0.f)
+			continue;
+
+		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
+		normals[triangles][0] = normalx / area;
+		normals[triangles][1] = normaly / area;
+		normals[triangles][2] = normalz / area;
+		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
+		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
+		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
+		triangles++;
+	}
+
+	meshopt_Bounds bounds = {};
+
+	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
+	if (triangles == 0)
+		return bounds;
+
+	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, corners[0], triangles * 3);
+
+	float center[3] = {psphere[0], psphere[1], psphere[2]};
+
+	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
+	float nsphere[4] = {};
+	computeBoundingSphere(nsphere, normals, triangles);
+
+	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
+	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
+	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;
+
+	axis[0] *= invaxislength;
+	axis[1] *= invaxislength;
+	axis[2] *= invaxislength;
+
+	// compute a tight cone around all normals, mindp = cos(angle/2)
+	float mindp = 1.f;
+
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
+
+		mindp = (dp < mindp) ? dp : mindp;
+	}
+
+	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
+	bounds.center[0] = center[0];
+	bounds.center[1] = center[1];
+	bounds.center[2] = center[2];
+	bounds.radius = psphere[3];
+
+	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
+	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
+	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
+	if (mindp <= 0.1f)
+	{
+		bounds.cone_cutoff = 1;
+		bounds.cone_cutoff_s8 = 127;
+		return bounds;
+	}
+
+	float maxt = 0;
+
+	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		// dot(center-t*axis-corner, trinormal) = 0
+		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
+		float cx = center[0] - corners[i][0][0];
+		float cy = center[1] - corners[i][0][1];
+		float cz = center[2] - corners[i][0][2];
+
+		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
+		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
+
+		// dn should be larger than mindp cutoff above
+		assert(dn > 0.f);
+		float t = dc / dn;
+
+		maxt = (t > maxt) ? t : maxt;
+	}
+
+	// cone apex should be in the negative half-space of all cluster triangles by construction
+	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
+	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
+	bounds.cone_apex[2] = center[2] - axis[2] * maxt;
+
+	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
+	bounds.cone_axis[0] = axis[0];
+	bounds.cone_axis[1] = axis[1];
+	bounds.cone_axis[2] = axis[2];
+
+	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
+	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
+	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
+
+	// quantize axis & cutoff to 8-bit SNORM format
+	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
+	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
+	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
+
+	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
+	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
+	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
+	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
+
+	// note that we need to round this up instead of rounding to nearest, hence +1
+	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
+
+	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);
+
+	return bounds;
+}
+
+meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(triangle_count <= kMeshletMaxTriangles);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	unsigned int indices[kMeshletMaxTriangles * 3];
+
+	for (size_t i = 0; i < triangle_count * 3; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet_triangles[i]];
+		assert(index < vertex_count);
+
+		indices[i] = index;
+	}
+
+	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
+{
+	using namespace meshopt;
+
+	assert(triangle_count <= kMeshletMaxTriangles);
+	assert(vertex_count <= kMeshletMaxVertices);
+
+	unsigned char* indices = meshlet_triangles;
+	unsigned int* vertices = meshlet_vertices;
+
+	// cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)
+	unsigned char cache[kMeshletMaxVertices];
+	memset(cache, 0, vertex_count);
+
+	// note that we start from a value that means all vertices aren't in cache
+	unsigned char cache_last = 128;
+	const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse
+
+	for (size_t i = 0; i < triangle_count; ++i)
+	{
+		int next = -1;
+		int next_match = -1;
+
+		for (size_t j = i; j < triangle_count; ++j)
+		{
+			unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];
+			assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+			// score each triangle by how many vertices are in cache
+			// note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully
+			int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;
+			int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;
+			int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;
+
+			if (aok + bok + cok > next_match)
+			{
+				next = (int)j;
+				next_match = aok + bok + cok;
+
+				// note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal
+				if (next_match >= 2)
+					break;
+			}
+		}
+
+		assert(next >= 0);
+
+		unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];
+
+		// shift triangles before the next one forward so that we always keep an ordered partition
+		// note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence
+		memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));
+
+		indices[i * 3 + 0] = a;
+		indices[i * 3 + 1] = b;
+		indices[i * 3 + 2] = c;
+
+		// cache timestamp is the same between all vertices of each triangle to reduce overflow
+		cache_last++;
+		cache[a] = cache_last;
+		cache[b] = cache_last;
+		cache[c] = cache_last;
+	}
+
+	// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
+	unsigned int order[kMeshletMaxVertices];
+
+	unsigned char remap[kMeshletMaxVertices];
+	memset(remap, -1, vertex_count);
+
+	size_t vertex_offset = 0;
+
+	for (size_t i = 0; i < triangle_count * 3; ++i)
+	{
+		unsigned char& r = remap[indices[i]];
+
+		if (r == 0xff)
+		{
+			r = (unsigned char)(vertex_offset);
+			order[vertex_offset] = vertices[indices[i]];
+			vertex_offset++;
+		}
+
+		indices[i] = r;
+	}
+
+	assert(vertex_offset <= vertex_count);
+	memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
+}
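
A minimal end-to-end sketch of the clusterizer API defined in this file; the helper below is hypothetical, and the 64/124 limits, cone weight, and buffer sizing are illustrative choices rather than library requirements:

#include <vector>

#include "meshoptimizer.h"

// split an indexed triangle mesh into meshlets and compute per-meshlet culling data
void buildMeshletsExample(const std::vector<unsigned int>& indices, const std::vector<float>& positions) // positions = tightly packed xyz
{
	const size_t max_vertices = 64;
	const size_t max_triangles = 124; // must be divisible by 4
	const float cone_weight = 0.5f;   // blend between spatial and normal-cone scoring

	size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);

	std::vector<meshopt_Meshlet> meshlets(max_meshlets);
	std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
	std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

	size_t meshlet_count = meshopt_buildMeshlets(
	    meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
	    indices.data(), indices.size(),
	    positions.data(), positions.size() / 3, sizeof(float) * 3,
	    max_vertices, max_triangles, cone_weight);

	for (size_t i = 0; i < meshlet_count; ++i)
	{
		const meshopt_Meshlet& m = meshlets[i];

		// reorder meshlet-local indices/vertices for locality, then compute sphere/cone bounds for culling
		meshopt_optimizeMeshlet(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset], m.triangle_count, m.vertex_count);

		meshopt_Bounds bounds = meshopt_computeMeshletBounds(
		    &meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset], m.triangle_count,
		    positions.data(), positions.size() / 3, sizeof(float) * 3);
		(void)bounds; // upload to the GPU for cluster culling as needed
	}
}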

+ 674 - 0
include/meshoptimizer/indexcodec.cpp

@@ -0,0 +1,674 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
+// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
+namespace meshopt
+{
+
+const unsigned char kIndexHeader = 0xe0;
+const unsigned char kSequenceHeader = 0xd0;
+
+static int gEncodeIndexVersion = 1;
+
+typedef unsigned int VertexFifo[16];
+typedef unsigned int EdgeFifo[16][2];
+
+static const unsigned int kTriangleIndexOrder[3][3] = {
+    {0, 1, 2},
+    {1, 2, 0},
+    {2, 0, 1},
+};
+
+static const unsigned char kCodeAuxEncodingTable[16] = {
+    0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
+    0, 0, // last two entries aren't used for encoding
+};
+
+static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
+{
+	(void)a;
+
+	return (b == next) ? 1 : (c == next ? 2 : 0);
+}
+
+static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15;
+
+		unsigned int e0 = fifo[index][0];
+		unsigned int e1 = fifo[index][1];
+
+		if (e0 == a && e1 == b)
+			return (i << 2) | 0;
+		if (e0 == b && e1 == c)
+			return (i << 2) | 1;
+		if (e0 == c && e1 == a)
+			return (i << 2) | 2;
+	}
+
+	return -1;
+}
+
+static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
+{
+	fifo[offset][0] = a;
+	fifo[offset][1] = b;
+	offset = (offset + 1) & 15;
+}
+
+static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15;
+
+		if (fifo[index] == v)
+			return i;
+	}
+
+	return -1;
+}
+
+static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
+{
+	fifo[offset] = v;
+	offset = (offset + cond) & 15;
+}
+
+static void encodeVByte(unsigned char*& data, unsigned int v)
+{
+	// encode 32-bit value in up to 5 7-bit groups
+	do
+	{
+		*data++ = (v & 127) | (v > 127 ? 128 : 0);
+		v >>= 7;
+	} while (v);
+}
+
+static unsigned int decodeVByte(const unsigned char*& data)
+{
+	unsigned char lead = *data++;
+
+	// fast path: single byte
+	if (lead < 128)
+		return lead;
+
+	// slow path: up to 4 extra bytes
+	// note that this loop always terminates, which is important for malformed data
+	unsigned int result = lead & 127;
+	unsigned int shift = 7;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		unsigned char group = *data++;
+		result |= unsigned(group & 127) << shift;
+		shift += 7;
+
+		if (group < 128)
+			break;
+	}
+
+	return result;
+}
+
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
+{
+	unsigned int d = index - last;
+	unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+	encodeVByte(data, v);
+}
+
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
+{
+	unsigned int v = decodeVByte(data);
+	unsigned int d = (v >> 1) ^ -int(v & 1);
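+	// e.g. v = 2 decodes to d = +1 and v = 1 decodes to d = -1 (zigzag), so small signed deltas stay small and usually fit in one varint byte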
+
+	return last + d;
+}
+
+static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
+{
+	for (int i = 0; i < 16; ++i)
+		if (table[i] == v)
+			return i;
+
+	return -1;
+}
+
+static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
+{
+	if (index_size == 2)
+	{
+		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
+		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
+		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
+	}
+	else
+	{
+		static_cast<unsigned int*>(destination)[offset + 0] = a;
+		static_cast<unsigned int*>(destination)[offset + 1] = b;
+		static_cast<unsigned int*>(destination)[offset + 2] = c;
+	}
+}
+
+} // namespace meshopt
+
+size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kIndexHeader | version);
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	unsigned char* code = buffer + 1;
+	unsigned char* data = code + index_count / 3;
+	unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	int fecmax = version >= 1 ? 13 : 15;
+
+	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
+	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
+	const unsigned char* codeaux_table = kCodeAuxEncodingTable;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough space to write a triangle
+		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can write without extra bounds checks
+		if (data > data_safe_end)
+			return 0;
+
+		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
+
+		if (fer >= 0 && (fer >> 2) < 15)
+		{
+			const unsigned int* order = kTriangleIndexOrder[fer & 3];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// encode edge index and vertex fifo index, next or free index
+			int fe = fer >> 2;
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next ? (next++, 0) : 15);
+
+			if (fec == 15 && version >= 1)
+			{
+				// encode last-1 and last+1 to optimize strip-like sequences
+				if (c + 1 == last)
+					fec = 13, last = c;
+				if (c == last + 1)
+					fec = 14, last = c;
+			}
+
+			*code++ = (unsigned char)((fe << 4) | fec);
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// we only need to push third vertex since first two are likely already in the vertex fifo
+			if (fec == 0 || fec >= fecmax)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// we only need to push two new edges to edge fifo since the third one is already there
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+		else
+		{
+			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
+			const unsigned int* order = kTriangleIndexOrder[rotation];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// if a/b/c are 0/1/2, we emit a reset code
+			bool reset = false;
+
+			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+			{
+				reset = true;
+				next = 0;
+
+				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+				// this makes sure next continues to get incremented instead of being stuck
+				memset(vertexfifo, -1, sizeof(vertexfifo));
+			}
+
+			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			int fea = (a == next) ? (next++, 0) : 15;
+			int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
+			int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
+
+			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
+			unsigned char codeaux = (unsigned char)((feb << 4) | fec);
+			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
+
+			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
+			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
+			{
+				*code++ = (unsigned char)((15 << 4) | codeauxindex);
+			}
+			else
+			{
+				*code++ = (unsigned char)((15 << 4) | 14 | fea);
+				*data++ = codeaux;
+			}
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fea == 15)
+				encodeIndex(data, a, last), last = a;
+
+			if (feb == 15)
+				encodeIndex(data, b, last), last = b;
+
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// only push vertices that weren't already in fifo
+			if (fea == 0 || fea == 15)
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+
+			if (feb == 0 || feb == 15)
+				pushVertexFifo(vertexfifo, b, vertexfifooffset);
+
+			if (fec == 0 || fec == 15)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// none of the three edges are in the fifo; pushing all of them is important so that we can match them for later triangles
+			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+	}
+
+	// make sure we have enough space to write codeaux table
+	if (data > data_safe_end)
+		return 0;
+
+	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
+	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
+	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
+	for (size_t i = 0; i < 16; ++i)
+	{
+		// decoder assumes that table entries never refer to separately encoded indices
+		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
+
+		*data++ = codeaux_table[i];
+	}
+
+	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+	assert(codeaux_table[0] == 0);
+
+	assert(data >= buffer + index_count / 3 + 16);
+	assert(data <= buffer + buffer_size);
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
+	unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7;
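+	// e.g. vertex_count = 65536 gives vertex_bits = 16 and vertex_groups = 3, i.e. at most 2 + 3*3 = 11 bytes per triangle (illustrative)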
+
+	return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16;
+}
+
+void meshopt_encodeIndexVersion(int version)
+{
+	assert(unsigned(version) <= 1);
+
+	meshopt::gEncodeIndexVersion = version;
+}
+
+int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(index_size == 2 || index_size == 4);
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kIndexHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	int fecmax = version >= 1 ? 13 : 15;
+
+	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
+	const unsigned char* code = buffer + 1;
+	const unsigned char* data = code + index_count / 3;
+	const unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	const unsigned char* codeaux_table = data_safe_end;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough data to read for a triangle
+		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can read without extra bounds checks
+		if (data > data_safe_end)
+			return -2;
+
+		unsigned char codetri = *code++;
+
+		if (codetri < 0xf0)
+		{
+			int fe = codetri >> 4;
+
+			// fifo reads are wrapped around 16 entry buffer
+			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
+			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+
+			int fec = codetri & 15;
+
+			// note: this is the most common path in the entire decoder
+			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
+			if (fec < fecmax)
+			{
+				// fifo reads are wrapped around 16 entry buffer
+				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				unsigned int c = 0;
+
+				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
+				// note that we need to update the last index since free indices are delta-encoded
+				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+		else
+		{
+			// fast path: read codeaux from the table
+			if (codetri < 0xfe)
+			{
+				unsigned char codeaux = codeaux_table[codetri & 15];
+
+				// note: table can't contain feb/fec=15
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = next++;
+
+				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int b = (feb == 0) ? next : bf;
+
+				int feb0 = feb == 0;
+				next += feb0;
+
+				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				// slow path: read a full byte for codeaux instead of using a table lookup
+				unsigned char codeaux = *data++;
+
+				int fea = codetri == 0xfe ? 0 : 15;
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// reset: codeaux is 0 but encoded as not-a-table
+				if (codeaux == 0)
+					next = 0;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = (fea == 0) ? next++ : 0;
+				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
+
+				// note that we need to update the last index since free indices are delta-encoded
+				if (fea == 15)
+					last = a = decodeIndex(data, last);
+
+				if (feb == 15)
+					last = b = decodeIndex(data, last);
+
+				if (fec == 15)
+					last = c = decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and codeaux table
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kSequenceHeader | version);
+
+	unsigned int last[2] = {};
+	unsigned int current = 0;
+
+	unsigned char* data = buffer + 1;
+	unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to write
+		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can write without extra bounds checks
+		if (data >= data_safe_end)
+			return 0;
+
+		unsigned int index = indices[i];
+
+		// this is a heuristic that switches between baselines when the delta grows too large
+		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+		int cd = int(index - last[current]);
+		current ^= ((cd < 0 ? -cd : cd) >= 30);
+
+		// encode delta from the last index
+		unsigned int d = index - last[current];
+		unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+		// note: low bit encodes the index of the last baseline which will be used for reconstruction
+		encodeVByte(data, (v << 1) | current);
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+	}
+
+	// make sure we have enough space to write tail
+	if (data > data_safe_end)
+		return 0;
+
+	for (int k = 0; k < 4; ++k)
+		*data++ = 0;
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+	unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7;
+
+	return 1 + index_count * vertex_groups + 4;
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kSequenceHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	const unsigned char* data = buffer + 1;
+	const unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	unsigned int last[2] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to read
+		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can read without extra bounds checks
+		if (data >= data_safe_end)
+			return -2;
+
+		unsigned int v = decodeVByte(data);
+
+		// decode the index of the last baseline
+		unsigned int current = v & 1;
+		v >>= 1;
+
+		// reconstruct index as a delta
+		unsigned int d = (v >> 1) ^ -int(v & 1);
+		unsigned int index = last[current] + d;
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+
+		if (index_size == 2)
+		{
+			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
+		}
+		else
+		{
+			static_cast<unsigned int*>(destination)[i] = index;
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and tail
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
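
A round-trip sketch for the index codec above; the helper is hypothetical, sizes the output with the bound function, and decodes into 32-bit indices:

#include <vector>

#include "meshoptimizer.h"

// compress a triangle index buffer and decode it back, verifying the codec is lossless
bool roundTripIndexBuffer(const std::vector<unsigned int>& indices, size_t vertex_count)
{
	std::vector<unsigned char> encoded(meshopt_encodeIndexBufferBound(indices.size(), vertex_count));
	size_t encoded_size = meshopt_encodeIndexBuffer(encoded.data(), encoded.size(), indices.data(), indices.size());
	if (encoded_size == 0)
		return false; // output buffer too small; cannot happen when sized with the bound above

	encoded.resize(encoded_size);

	std::vector<unsigned int> decoded(indices.size());
	int result = meshopt_decodeIndexBuffer(decoded.data(), decoded.size(), sizeof(unsigned int), encoded.data(), encoded.size());

	return result == 0 && decoded == indices;
}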

+ 675 - 0
include/meshoptimizer/indexgenerator.cpp

@@ -0,0 +1,675 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
+// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
+namespace meshopt
+{
+
+static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
+{
+	// MurmurHash2
+	const unsigned int m = 0x5bd1e995;
+	const int r = 24;
+
+	while (len >= 4)
+	{
+		unsigned int k = *reinterpret_cast<const unsigned int*>(key);
+
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+
+		key += 4;
+		len -= 4;
+	}
+
+	return h;
+}
+
+struct VertexHasher
+{
+	const unsigned char* vertices;
+	size_t vertex_size;
+	size_t vertex_stride;
+
+	size_t hash(unsigned int index) const
+	{
+		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
+	}
+};
+
+struct VertexStreamHasher
+{
+	const meshopt_Stream* streams;
+	size_t stream_count;
+
+	size_t hash(unsigned int index) const
+	{
+		unsigned int h = 0;
+
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			h = hashUpdate4(h, data + index * s.stride, s.size);
+		}
+
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
+				return false;
+		}
+
+		return true;
+	}
+};
+
+struct EdgeHasher
+{
+	const unsigned int* remap;
+
+	size_t hash(unsigned long long edge) const
+	{
+		unsigned int e0 = unsigned(edge >> 32);
+		unsigned int e1 = unsigned(edge);
+
+		unsigned int h1 = remap[e0];
+		unsigned int h2 = remap[e1];
+
+		const unsigned int m = 0x5bd1e995;
+
+		// MurmurHash64B finalizer
+		h1 ^= h2 >> 18;
+		h1 *= m;
+		h2 ^= h1 >> 22;
+		h2 *= m;
+		h1 ^= h2 >> 17;
+		h1 *= m;
+		h2 ^= h1 >> 19;
+		h2 *= m;
+
+		return h2;
+	}
+
+	bool equal(unsigned long long lhs, unsigned long long rhs) const
+	{
+		unsigned int l0 = unsigned(lhs >> 32);
+		unsigned int l1 = unsigned(lhs);
+
+		unsigned int r0 = unsigned(rhs >> 32);
+		unsigned int r1 = unsigned(rhs);
+
+		return remap[l0] == remap[r0] && remap[l1] == remap[r1];
+	}
+};
+
+static size_t hashBuckets(size_t count)
+{
+	size_t buckets = 1;
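+	// grow to the smallest power of two that is >= count * 1.25, keeping the load factor at or below ~0.8
+	// e.g. count = 1000 -> threshold 1250 -> 2048 buckets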
+	while (buckets < count + count / 4)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0);
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item;
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod;
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return NULL;
+}
+
+static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{
+	VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
+
+	size_t vertex_table_size = hashBuckets(vertex_count);
+	unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
+	memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index;
+
+		remap[index] = *entry;
+	}
+
+	allocator.deallocate(vertex_table);
+}
+
+template <size_t BlockSize>
+static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
+	assert(block_size == vertex_size);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
+		}
+}
+
+} // namespace meshopt
+
+size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(!indices || index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index;
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry];
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index;
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry];
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place remap
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
+	switch (vertex_size)
+	{
+	case 4:
+		return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 8:
+		return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 12:
+		return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 16:
+		return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
+
+	default:
+		return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
+	}
+}
+
+void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+	assert(index_count % 3 == 0);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(remap[index] != ~0u);
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size <= vertex_stride);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index;
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index;
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
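+	// next[e] is the corner that follows corner e; the duplicated last entry lets next[e + 1] wrap without a modulo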
+	static const int next[4] = {1, 2, 0, 1};
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size);
+
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+	memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			unsigned int i2 = indices[i + next[e + 1]];
+			assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull)
+			{
+				*entry = edge;
+
+				// store vertex opposite to the edge
+				edge_vertex_table[entry - edge_table] = i2;
+			}
+		}
+	}
+
+	// build resulting index buffer: 6 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[6];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			patch[e * 2 + 0] = i0;
+			patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table];
+		}
+
+		memcpy(destination + i * 2, patch, sizeof(patch));
+	}
+}
+
+void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[3] = {1, 2, 0};
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull)
+				*entry = edge;
+		}
+	}
+
+	// build resulting index buffer: 12 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[12];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			// use the same edge if opposite edge doesn't exist (border)
+			oppe = (oppe == ~0ull) ? edge : oppe;
+
+			// triangle index (0, 1, 2)
+			patch[e] = i0;
+
+			// opposite edge (3, 4; 5, 6; 7, 8)
+			patch[3 + e * 2 + 0] = unsigned(oppe);
+			patch[3 + e * 2 + 1] = unsigned(oppe >> 32);
+
+			// dominant vertex (9, 10, 11)
+			patch[9 + e] = remap[i0];
+		}
+
+		memcpy(destination + i * 4, patch, sizeof(patch));
+	}
+}
+
+size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	// compute vertex valence; this is used to prioritize least used corner
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	unsigned int reorder_offset = 0;
+
+	// assign provoking vertices; leave the rest for the next pass
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// try to rotate triangle such that provoking vertex hasn't been seen before
+		// if multiple vertices are new, prioritize the one with least valence
+		// this reduces the risk that a future triangle will have all three vertices seen
+		unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
+		unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
+		unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
+
+		if (vb != ~0u && vb <= va && vb <= vc)
+		{
+			// abc -> bca
+			unsigned int t = a;
+			a = b, b = c, c = t;
+		}
+		else if (vc != ~0u && vc <= va && vc <= vb)
+		{
+			// abc -> cab
+			unsigned int t = c;
+			c = b, b = a, a = t;
+		}
+
+		unsigned int newidx = reorder_offset;
+
+		// now remap[a] = ~0u or all three vertices are old
+		// recording remap[a] makes it possible to remap future references to the same index, conserving space
+		if (remap[a] == ~0u)
+			remap[a] = newidx;
+
+		// we need to clone the provoking vertex to get a unique index
+		// if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
+		reorder[reorder_offset++] = a;
+
+		// note: first vertex is final, the other two will be fixed up in next pass
+		destination[i + 0] = newidx;
+		destination[i + 1] = b;
+		destination[i + 2] = c;
+
+		// update vertex valences for corner heuristic
+		valence[a]--;
+		valence[b]--;
+		valence[c]--;
+	}
+
+	// remap or clone non-provoking vertices (iterating to skip provoking vertices)
+	int step = 1;
+
+	for (size_t i = 1; i < index_count; i += step, step ^= 3)
+	{
+		unsigned int index = destination[i];
+
+		if (remap[index] == ~0u)
+		{
+			// we haven't seen the vertex before as a provoking vertex
+			// to maintain the reference to the original vertex we need to clone it
+			unsigned int newidx = reorder_offset;
+
+			remap[index] = newidx;
+			reorder[reorder_offset++] = index;
+		}
+
+		destination[i] = remap[index];
+	}
+
+	assert(reorder_offset <= vertex_count + index_count / 3);
+	return reorder_offset;
+}

+ 1138 - 0
include/meshoptimizer/meshoptimizer.h

@@ -0,0 +1,1138 @@
+/**
+ * meshoptimizer - version 0.22
+ *
+ * Copyright (C) 2016-2024, by Arseny Kapoulkine ([email protected])
+ * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
+ *
+ * This library is distributed under the MIT License. See notice at the end of this file.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+
+/* Version macro; major * 1000 + minor * 10 + patch */
+#define MESHOPTIMIZER_VERSION 220 /* 0.22 */
+
+/* If no API is defined, assume default */
+#ifndef MESHOPTIMIZER_API
+#define MESHOPTIMIZER_API
+#endif
+
+/* Set the calling-convention for alloc/dealloc function pointers */
+#ifndef MESHOPTIMIZER_ALLOC_CALLCONV
+#ifdef _MSC_VER
+#define MESHOPTIMIZER_ALLOC_CALLCONV __cdecl
+#else
+#define MESHOPTIMIZER_ALLOC_CALLCONV
+#endif
+#endif
+
+/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#ifndef MESHOPTIMIZER_EXPERIMENTAL
+#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+#endif
+
+/* C interface */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * Vertex attribute stream
+ * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size).
+ */
+struct meshopt_Stream
+{
+	const void* data;
+	size_t size;
+	size_t stride;
+};
+
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ * stream_count must be <= 16
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
+ * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
+ */
+MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
+
+/**
+ * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
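+
+/*
+ * Illustrative sketch (not part of the library): a typical re-indexing pass. Assumes caller-provided
+ * std::vector<unsigned int> indices, std::vector<Vertex> vertices and a plain interleaved Vertex struct.
+ *
+ *   std::vector<unsigned int> remap(vertices.size());
+ *   size_t unique = meshopt_generateVertexRemap(remap.data(), indices.data(), indices.size(), vertices.data(), vertices.size(), sizeof(Vertex));
+ *
+ *   std::vector<Vertex> new_vertices(unique);
+ *   std::vector<unsigned int> new_indices(indices.size());
+ *   meshopt_remapVertexBuffer(new_vertices.data(), vertices.data(), vertices.size(), sizeof(Vertex), remap.data());
+ *   meshopt_remapIndexBuffer(new_indices.data(), indices.data(), indices.size(), remap.data());
+ */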
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * stream_count must be <= 16
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
+ * Each triangle is converted into a 6-vertex patch with the following layout:
+ * - 0, 2, 4: original triangle vertices
+ * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40
+ * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY.
+ * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*2 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement
+ * Each triangle is converted into a 12-vertex patch with the following layout:
+ * - 0, 1, 2: original triangle vertices
+ * - 3, 4: opposing edge for edge 0, 1
+ * - 5, 6: opposing edge for edge 1, 2
+ * - 7, 8: opposing edge for edge 2, 0
+ * - 9, 10, 11: dominant vertices for corners 0, 1, 2
+ * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping.
+ * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*4 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to the primitive id; this allows passing it to the fragment shader using a nointerpolation attribute.
+ * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
+ * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
+ * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
+ * For maximum efficiency the input index buffer should be optimized for vertex cache first.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * If the index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for FIFO caches
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
+ * If the index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * cache_size should be less than the actual GPU cache size to avoid cache thrashing
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+
+/**
+ * Overdraw optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
+ * If the index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
+ */
+MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+
+/**
+ * Vertex fetch cache optimizer
+ * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
+ * indices is used both as an input and as an output index buffer
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
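+
+/*
+ * Illustrative sketch (not part of the library): the usual post-indexing optimization sequence. Assumes
+ * new_indices/new_vertices/unique from the re-indexing sketch above and float3 position at the start of Vertex.
+ *
+ *   std::vector<unsigned int> cache_indices(new_indices.size());
+ *   meshopt_optimizeVertexCache(cache_indices.data(), new_indices.data(), new_indices.size(), unique);
+ *
+ *   const float* positions = reinterpret_cast<const float*>(new_vertices.data());
+ *   std::vector<unsigned int> draw_indices(cache_indices.size());
+ *   meshopt_optimizeOverdraw(draw_indices.data(), cache_indices.data(), cache_indices.size(), positions, unique, sizeof(Vertex), 1.05f);
+ *
+ *   std::vector<Vertex> opt_vertices(unique);
+ *   meshopt_optimizeVertexFetch(opt_vertices.data(), draw_indices.data(), draw_indices.size(), new_vertices.data(), unique, sizeof(Vertex));
+ */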
+
+/**
+ * Vertex fetch cache optimizer
+ * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Index buffer encoder
+ * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
+ *
+ * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
+
+/**
+ * Index buffer decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
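+
+/*
+ * Illustrative sketch (not part of the library): round-tripping a triangle list through the index codec.
+ * Assumes caller-provided std::vector<unsigned int> indices and the total vertex_count.
+ *
+ *   std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(indices.size(), vertex_count));
+ *   ibuf.resize(meshopt_encodeIndexBuffer(ibuf.data(), ibuf.size(), indices.data(), indices.size()));
+ *
+ *   std::vector<unsigned int> decoded(indices.size());
+ *   int res = meshopt_decodeIndexBuffer(decoded.data(), decoded.size(), sizeof(unsigned int), ibuf.data(), ibuf.size());
+ *   assert(res == 0);
+ */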
+
+/**
+ * Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer encoder
+ * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
+
+/**
+ * Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
+
+/**
+ * Vertex buffer decoder
+ * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
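+
+/*
+ * Illustrative sketch (not part of the library): round-tripping an interleaved vertex buffer through the
+ * vertex codec; pairs with the index codec sketch above.
+ *
+ *   std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
+ *   vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices.data(), vertices.size(), sizeof(Vertex)));
+ *
+ *   std::vector<Vertex> decoded(vertices.size());
+ *   int res = meshopt_decodeVertexBuffer(decoded.data(), decoded.size(), sizeof(Vertex), vbuf.data(), vbuf.size());
+ *   assert(res == 0);
+ */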
+
+/**
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+
+/**
+ * Vertex buffer filter encoders
+ * These functions can be used to encode data in a format that meshopt_decodeFilter can decode
+ *
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * Input data must contain 4 floats for every vector (count*4 total).
+ *
+ * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ * Input data must contain 4 floats for every quaternion (count*4 total).
+ *
+ * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
+ * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
+ * Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ */
+enum meshopt_EncodeExpMode
+{
+	/* When encoding exponents, use separate values for each component (maximum quality) */
+	meshopt_EncodeExpSeparate,
+	/* When encoding exponents, use shared value for all components of each vector (better compression) */
+	meshopt_EncodeExpSharedVector,
+	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
+	meshopt_EncodeExpSharedComponent,
+	/* Experimental: When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+	meshopt_EncodeExpClamped,
+};
+
+MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
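+
+/*
+ * Illustrative sketch (not part of the library): packing unit normals with the octahedral filter.
+ * Assumes `normals` holds `count` unit vectors stored as 4 floats each (w unused).
+ *
+ *   std::vector<unsigned char> packed(count * 4); // stride 4 -> 8-bit snorm x/y/z/w per normal
+ *   meshopt_encodeFilterOct(packed.data(), count, 4, 8, normals.data());
+ *
+ *   // after decoding the transmitted vertex data, the matching filter reconstructs unit vectors in place
+ *   // (still stored as 8-bit snorm components):
+ *   meshopt_decodeFilterOct(packed.data(), count, 4);
+ */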
+
+/**
+ * Simplification options
+ */
+enum
+{
+	/* Do not move vertices that are located on the topological border (vertices on triangle edges that don't have a paired triangle). Useful for simplifying portions of the larger mesh. */
+	meshopt_SimplifyLockBorder = 1 << 0,
+	/* Improve simplification performance assuming input indices are a sparse subset of the mesh. Note that error becomes relative to subset extents. */
+	meshopt_SimplifySparse = 1 << 1,
+	/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
+	meshopt_SimplifyErrorAbsolute = 1 << 2,
+	/* Experimental: remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+	meshopt_SimplifyPrune = 1 << 3,
+};
+
+/**
+ * Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are required, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
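+
+/*
+ * Illustrative sketch (not part of the library): building a LOD with ~50% of the triangles and up to 1% error.
+ * Assumes float3 position at the start of an interleaved Vertex struct; options = 0, lod_error receives the result.
+ *
+ *   size_t target_index_count = (indices.size() / 3 / 2) * 3;
+ *   float lod_error = 0.f;
+ *
+ *   std::vector<unsigned int> lod(indices.size());
+ *   lod.resize(meshopt_simplify(lod.data(), indices.data(), indices.size(),
+ *       reinterpret_cast<const float*>(vertices.data()), vertices.size(), sizeof(Vertex),
+ *       target_index_count, 0.01f, 0, &lod_error));
+ */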
+
+/**
+ * Experimental: Mesh simplifier with attribute metric
+ * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
+ * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ *
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
+/**
+ * Experimental: Mesh simplifier (sloppy)
+ * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+
+/**
+ * Experimental: Point cloud simplifier
+ * Reduces the number of points in the cloud to reach the given target
+ * Returns the number of points after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer (target_vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * color_weight determines relative priority of color wrt position; 1.0 is a safe default
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+
+/**
+ * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
+ *
+ * Absolute error must be *divided* by the scaling factor before passing it to meshopt_simplify as target_error
+ * Relative error returned by meshopt_simplify via result_error must be *multiplied* by the scaling factor to get absolute error.
+ */
+MESHOPTIMIZER_API float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
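+
+/*
+ * Illustrative sketch (not part of the library): converting an absolute tolerance of 0.1 units into the
+ * relative target_error expected by meshopt_simplify (when meshopt_SimplifyErrorAbsolute is not used):
+ *
+ *   float scale = meshopt_simplifyScale(reinterpret_cast<const float*>(vertices.data()), vertices.size(), sizeof(Vertex));
+ *   float target_error = 0.1f / scale;
+ */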
+
+/**
+ * Mesh stripifier
+ * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles
+ * Returns the number of indices in the resulting strip, with destination containing new index data
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
+ * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles
+ */
+MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
+
+/**
+ * Mesh unstripifier
+ * Converts a triangle strip to a triangle list
+ * Returns the number of indices in the resulting list, with destination containing new index data
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
+ */
+MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
+
+struct meshopt_VertexCacheStatistics
+{
+	unsigned int vertices_transformed;
+	unsigned int warps_executed;
+	float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
+	float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
+};
+
+/**
+ * Vertex transform cache analyzer
+ * Returns cache hit statistics using a simplified FIFO model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+
+struct meshopt_OverdrawStatistics
+{
+	unsigned int pixels_covered;
+	unsigned int pixels_shaded;
+	float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
+};
+
+/**
+ * Overdraw analyzer
+ * Returns overdraw statistics using a software rasterizer
+ * Results may not match actual GPU performance
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched;
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Meshlet is a small mesh cluster (subset) that consists of:
+ * - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use;
+ * - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
+ *
+ * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
+ */
+struct meshopt_Meshlet
+{
+	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
+	unsigned int vertex_offset;
+	unsigned int triangle_offset;
+
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	unsigned int vertex_count;
+	unsigned int triangle_count;
+};
+
+/**
+ * Meshlet builder
+ * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
+ * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
+ * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
+ * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
+ */
+MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
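+
+/*
+ * Illustrative sketch (not part of the library): building meshlets with assumed limits of 64 vertices /
+ * 124 triangles (divisible by 4) and cone culling disabled (cone_weight = 0).
+ *
+ *   const size_t max_vertices = 64, max_triangles = 124;
+ *   size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);
+ *
+ *   std::vector<meshopt_Meshlet> meshlets(max_meshlets);
+ *   std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
+ *   std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);
+ *
+ *   meshlets.resize(meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
+ *       indices.data(), indices.size(), reinterpret_cast<const float*>(vertices.data()), vertices.size(), sizeof(Vertex),
+ *       max_vertices, max_triangles, 0.f));
+ */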
+
+/**
+ * Experimental: Meshlet optimizer
+ * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ *
+ * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
+ * need to be computed from meshlet's vertex_offset and triangle_offset
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+
+struct meshopt_Bounds
+{
+	/* bounding sphere, useful for frustum and occlusion culling */
+	float center[3];
+	float radius;
+
+	/* normal cone, useful for backface culling */
+	float cone_apex[3];
+	float cone_axis[3];
+	float cone_cutoff; /* = cos(angle/2) */
+
+	/* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
+	signed char cone_axis_s8[3];
+	signed char cone_cutoff_s8;
+};
+
+/**
+ * Cluster bounds generator
+ * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
+ *
+ * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
+ *   dot(view, cone_axis) >= cone_cutoff
+ *
+ * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
+ *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
+ *
+ * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
+ *   dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
+ * or an equivalent formula that doesn't have a singularity at center = camera_position:
+ *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
+ *
+ * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable (for derivation see
+ * Real-Time Rendering 4th Edition, section 19.3).
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
+ * index_count/3 and triangle_count must not exceed implementation limits (<= 512)
+ */
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
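+
+/* Example (illustrative sketch; `m` is a meshopt_Meshlet produced by meshopt_buildMeshlets, `camera_position` is an assumed float[3]):
+ *
+ *   meshopt_Bounds bounds = meshopt_computeMeshletBounds(&meshlet_vertices[m.vertex_offset],
+ *       &meshlet_triangles[m.triangle_offset], m.triangle_count, &vertices[0].x, vertices.size(), sizeof(Vertex));
+ *
+ *   // apex-free backface rejection test from the comment above
+ *   float dx = bounds.center[0] - camera_position[0];
+ *   float dy = bounds.center[1] - camera_position[1];
+ *   float dz = bounds.center[2] - camera_position[2];
+ *   float dist = sqrtf(dx * dx + dy * dy + dz * dz);
+ *
+ *   bool backfacing = dx * bounds.cone_axis[0] + dy * bounds.cone_axis[1] + dz * bounds.cone_axis[2] >=
+ *       bounds.cone_cutoff * dist + bounds.radius;
+ */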
+
+/**
+ * Spatial sorter
+ * Generates a remap table that can be used to reorder points for spatial locality.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
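+
+/* Example (illustrative sketch for a point cloud; `Point` is an assumed struct with float x, y, z in its first 12 bytes):
+ *
+ *   std::vector<unsigned int> remap(points.size());
+ *   meshopt_spatialSortRemap(remap.data(), &points[0].x, points.size(), sizeof(Point));
+ *
+ *   std::vector<Point> sorted(points.size());
+ *   meshopt_remapVertexBuffer(sorted.data(), points.data(), points.size(), sizeof(Point), remap.data());
+ */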
+
+/**
+ * Experimental: Spatial sorter
+ * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
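+
+/* Example (illustrative sketch; the reordered index buffer can then be fed to meshopt_optimizeVertexCache):
+ *
+ *   std::vector<unsigned int> sorted(indices.size());
+ *   meshopt_spatialSortTriangles(sorted.data(), indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex));
+ */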
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
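+
+/* Example (illustrative sketch; my_allocate/my_deallocate are hypothetical callbacks that simply forward to malloc/free):
+ *
+ *   void* my_allocate(size_t size) { return malloc(size); }
+ *   void my_deallocate(void* ptr) { free(ptr); }
+ *
+ *   meshopt_setAllocator(my_allocate, my_deallocate);
+ */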
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into commonly supported data formats */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
+
+/**
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
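+
+/* Example (illustrative sketch; `v`, `n` and `uv` are assumed per-vertex position/normal/texcoord data):
+ *
+ *   unsigned short hx = meshopt_quantizeHalf(v.x);  // 16-bit half-float position component
+ *   float fx = meshopt_quantizeFloat(v.x, 15);      // fp32 layout with 15 significant mantissa bits, compresses better
+ *   int nx = meshopt_quantizeSnorm(n.x, 8);         // 8-bit snorm normal component; decode with nx / 127.f
+ *   int tx = meshopt_quantizeUnorm(uv.x, 12);       // 12-bit unorm texcoord, assuming uv is in [0..1]
+ */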
+#endif
+
+/**
+ * C++ template interface
+ *
+ * These functions mirror the C interface the library provides, providing template-based overloads so that
+ * the caller can use an arbitrary type for the index data, both for input and output.
+ * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
+ * the wrappers end up allocating memory and copying index data to convert from one type to another.
+ */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+#endif
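+
+/* Example (illustrative sketch; `indices16` is an assumed std::vector<unsigned short> index buffer):
+ *
+ *   // since sizeof(unsigned short) != sizeof(unsigned int), this overload copies the indices into a temporary
+ *   // unsigned int buffer, calls the C function and converts the result back to unsigned short
+ *   meshopt_optimizeVertexCache(indices16.data(), indices16.data(), indices16.size(), vertex_count);
+ */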
+
+/* Inline implementation */
+#ifdef __cplusplus
+inline int meshopt_quantizeUnorm(float v, int N)
+{
+	const float scale = float((1 << N) - 1);
+
+	v = (v >= 0) ? v : 0;
+	v = (v <= 1) ? v : 1;
+
+	return int(v * scale + 0.5f);
+}
+
+inline int meshopt_quantizeSnorm(float v, int N)
+{
+	const float scale = float((1 << (N - 1)) - 1);
+
+	float round = (v >= 0 ? 0.5f : -0.5f);
+
+	v = (v >= -1) ? v : -1;
+	v = (v <= +1) ? v : +1;
+
+	return int(v * scale + round);
+}
+#endif
+
+/* Internal implementation helpers */
+#ifdef __cplusplus
+class meshopt_Allocator
+{
+public:
+	template <typename T>
+	struct StorageT
+	{
+		static void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+		static void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
+	};
+
+	typedef StorageT<void> Storage;
+
+	meshopt_Allocator()
+	    : blocks()
+	    , count(0)
+	{
+	}
+
+	~meshopt_Allocator()
+	{
+		for (size_t i = count; i > 0; --i)
+			Storage::deallocate(blocks[i - 1]);
+	}
+
+	template <typename T>
+	T* allocate(size_t size)
+	{
+		assert(count < sizeof(blocks) / sizeof(blocks[0]));
+		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+		blocks[count++] = result;
+		return result;
+	}
+
+	void deallocate(void* ptr)
+	{
+		assert(count > 0 && blocks[count - 1] == ptr);
+		Storage::deallocate(ptr);
+		count--;
+	}
+
+private:
+	void* blocks[24];
+	size_t count;
+};
+
+// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
+template <typename T>
+void* (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T>
+void (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+#endif
+
+/* Inline implementation for C++ templated wrappers */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
+struct meshopt_IndexAdapter;
+
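+// The non-zero-copy specialization converts indices into a temporary unsigned int buffer in the constructor
+// (for inputs) and converts the results back to T in the destructor (for outputs).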
+template <typename T>
+struct meshopt_IndexAdapter<T, false>
+{
+	T* result;
+	unsigned int* data;
+	size_t count;
+
+	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
+	    : result(result_)
+	    , data(NULL)
+	    , count(count_)
+	{
+		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
+
+		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+
+		if (input)
+		{
+			for (size_t i = 0; i < count; ++i)
+				data[i] = input[i];
+		}
+	}
+
+	~meshopt_IndexAdapter()
+	{
+		if (result)
+		{
+			for (size_t i = 0; i < count; ++i)
+				result[i] = T(data[i]);
+		}
+
+		meshopt_Allocator::Storage::deallocate(data);
+	}
+};
+
+template <typename T>
+struct meshopt_IndexAdapter<T, true>
+{
+	unsigned int* data;
+
+	meshopt_IndexAdapter(T* result, const T* input, size_t)
+	    : data(reinterpret_cast<unsigned int*>(result ? result : const_cast<T*>(input)))
+	{
+	}
+};
+
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemap(destination, indices ? in.data : NULL, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_remapIndexBuffer(out.data, indices ? in.data : NULL, index_count, remap);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 2);
+
+	meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 4);
+
+	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	size_t bound = vertex_count + (index_count / 3);
+	assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T
+	(void)bound;
+
+	return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
+}
+
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+	return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, options, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count / 3) * 5);
+
+	return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count - 2) * 3);
+
+	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+}
+
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
+}
+
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+	meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+#endif
+
+/**
+ * Copyright (c) 2016-2024 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */

+ 229 - 0
include/meshoptimizer/overdrawanalyzer.cpp

@@ -0,0 +1,229 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Nicolas Capens. Advanced Rasterization. 2004
+namespace meshopt
+{
+
+const int kViewport = 256;
+
+struct OverdrawBuffer
+{
+	float z[kViewport][kViewport][2];
+	unsigned int overdraw[kViewport][kViewport][2];
+};
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
+{
+	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
+	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
+	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
+	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
+	// we'll solve it with Cramer's rule
+	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
+	float invdet = (det == 0) ? 0 : 1 / det;
+
+	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
+
+	return det;
+}
+
+// half-space fixed point triangle rasterizer
+static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
+{
+	// compute depth gradients
+	float DZx, DZy;
+	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
+	int sign = det > 0;
+
+	// flip backfacing triangles to simplify rasterization logic
+	if (sign)
+	{
+		// flipping v2 & v3 preserves depth gradients since they're based on v1; only v1z is used below
+		float t;
+		t = v2x, v2x = v3x, v3x = t;
+		t = v2y, v2y = v3y, v3y = t;
+
+		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
+		v1z = kViewport - v1z;
+		DZx = -DZx;
+		DZy = -DZy;
+	}
+
+	// coordinates, 28.4 fixed point
+	int X1 = int(16.0f * v1x + 0.5f);
+	int X2 = int(16.0f * v2x + 0.5f);
+	int X3 = int(16.0f * v3x + 0.5f);
+
+	int Y1 = int(16.0f * v1y + 0.5f);
+	int Y2 = int(16.0f * v2y + 0.5f);
+	int Y3 = int(16.0f * v3y + 0.5f);
+
+	// bounding rectangle, clipped against viewport
+	// since we rasterize pixels with covered centers, min >0.5 should round up
+	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
+	// so max >= 0.5 should round down
+	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
+	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
+	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
+	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+
+	// deltas, 28.4 fixed point
+	int DX12 = X1 - X2;
+	int DX23 = X2 - X3;
+	int DX31 = X3 - X1;
+
+	int DY12 = Y1 - Y2;
+	int DY23 = Y2 - Y3;
+	int DY31 = Y3 - Y1;
+
+	// fill convention correction
+	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
+	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
+	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
+
+	// half edge equations, 24.8 fixed point
+	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
+	int FX = (minx << 4) + 8;
+	int FY = (miny << 4) + 8;
+	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
+	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
+	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
+	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);
+
+	for (int y = miny; y < maxy; y++)
+	{
+		int CX1 = CY1;
+		int CX2 = CY2;
+		int CX3 = CY3;
+		float ZX = ZY;
+
+		for (int x = minx; x < maxx; x++)
+		{
+			// check if all CXn are non-negative
+			if ((CX1 | CX2 | CX3) >= 0)
+			{
+				if (ZX >= buffer->z[y][x][sign])
+				{
+					buffer->z[y][x][sign] = ZX;
+					buffer->overdraw[y][x][sign]++;
+				}
+			}
+
+			// signed left shift is UB for negative numbers so use unsigned-signed casts
+			CX1 -= int(unsigned(DY12) << 4);
+			CX2 -= int(unsigned(DY23) << 4);
+			CX3 -= int(unsigned(DY31) << 4);
+			ZX += DZx;
+		}
+
+		// signed left shift is UB for negative numbers so use unsigned-signed casts
+		CY1 += int(unsigned(DX12) << 4);
+		CY2 += int(unsigned(DX23) << 4);
+		CY3 += int(unsigned(DX31) << 4);
+		ZY += DZy;
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_OverdrawStatistics result = {};
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			minv[j] = min(minv[j], v[j]);
+			maxv[j] = max(maxv[j], v[j]);
+		}
+	}
+
+	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
+	float scale = kViewport / extent;
+
+	float* triangles = allocator.allocate<float>(index_count * 3);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		const float* v = vertex_positions + index * vertex_stride_float;
+
+		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
+		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
+		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
+	}
+
+	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
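+	// rasterize the mesh with three orthographic projections, one along each axis; front- and back-facing
+	// triangles are accumulated into separate depth/overdraw layers so both facings contribute to the result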
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(buffer, 0, sizeof(OverdrawBuffer));
+
+		for (size_t i = 0; i < index_count; i += 3)
+		{
+			const float* vn0 = &triangles[3 * (i + 0)];
+			const float* vn1 = &triangles[3 * (i + 1)];
+			const float* vn2 = &triangles[3 * (i + 2)];
+
+			switch (axis)
+			{
+			case 0:
+				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+				break;
+			case 1:
+				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+				break;
+			case 2:
+				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+				break;
+			}
+		}
+
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				for (int s = 0; s < 2; ++s)
+				{
+					unsigned int overdraw = buffer->overdraw[y][x][s];
+
+					result.pixels_covered += overdraw > 0;
+					result.pixels_shaded += overdraw;
+				}
+	}
+
+	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;
+
+	return result;
+}

+ 333 - 0
include/meshoptimizer/overdrawoptimizer.cpp

@@ -0,0 +1,333 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float mesh_centroid[3] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		const float* p = vertex_positions + vertex_stride_float * indices[i];
+
+		mesh_centroid[0] += p[0];
+		mesh_centroid[1] += p[1];
+		mesh_centroid[2] += p[2];
+	}
+
+	mesh_centroid[0] /= index_count;
+	mesh_centroid[1] /= index_count;
+	mesh_centroid[2] /= index_count;
+
+	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
+	{
+		size_t cluster_begin = clusters[cluster] * 3;
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		float cluster_area = 0;
+		float cluster_centroid[3] = {};
+		float cluster_normal[3] = {};
+
+		for (size_t i = cluster_begin; i < cluster_end; i += 3)
+		{
+			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
+			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
+			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
+
+			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+			float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
+			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
+			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
+			cluster_normal[0] += normalx;
+			cluster_normal[1] += normaly;
+			cluster_normal[2] += normalz;
+			cluster_area += area;
+		}
+
+		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;
+
+		cluster_centroid[0] *= inv_cluster_area;
+		cluster_centroid[1] *= inv_cluster_area;
+		cluster_centroid[2] *= inv_cluster_area;
+
+		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
+		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;
+
+		cluster_normal[0] *= inv_cluster_normal_length;
+		cluster_normal[1] *= inv_cluster_normal_length;
+		cluster_normal[2] *= inv_cluster_normal_length;
+
+		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
+
+		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
+	}
+}
+
+static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
+{
+	// compute sort data bounds and renormalize, using fixed point snorm
+	float sort_data_max = 1e-3f;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		float dpa = fabsf(sort_data[i]);
+
+		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
+	}
+
+	const int sort_bits = 11;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		// note that we flip distribution since high dot product should come first
+		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
+
+		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
+	}
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		histogram[sort_keys[i]]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == cluster_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
+	}
+}
+
+static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
+{
+	unsigned int cache_misses = 0;
+
+	// if vertex is not in cache, put it in cache
+	if (timestamp - cache_timestamps[a] > cache_size)
+	{
+		cache_timestamps[a] = timestamp++;
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[b] > cache_size)
+	{
+		cache_timestamps[b] = timestamp++;
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[c] > cache_size)
+	{
+		cache_timestamps[c] = timestamp++;
+		cache_misses++;
+	}
+
+	return cache_misses;
+}
+
+static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
+{
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1;
+
+	size_t face_count = index_count / 3;
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
+		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
+		// suggests an inefficiency in the vertex cache optimization algorithm
+		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
+		if (i == 0 || m == 3)
+		{
+			destination[result++] = unsigned(i);
+		}
+	}
+
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
+{
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = 0;
+
+	size_t result = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		size_t start = clusters[it];
+		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
+		assert(start < end);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		// measure cluster ACMR
+		unsigned int cluster_misses = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			cluster_misses += m;
+		}
+
+		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
+
+		// first cluster always starts from the hard cluster boundary
+		destination[result++] = unsigned(start);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		unsigned int running_misses = 0;
+		unsigned int running_faces = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			running_misses += m;
+			running_faces += 1;
+
+			if (float(running_misses) / float(running_faces) <= cluster_threshold)
+			{
+				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
+				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
+				// cluster is empty; however, removing the trailing boundary after the loop will clean it up
+				destination[result++] = unsigned(i + 1);
+
+				// reset cache
+				timestamp += cache_size + 1;
+
+				running_misses = 0;
+				running_faces = 0;
+			}
+		}
+
+		// each time we reach the target ACMR we flush the cluster
+		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
+		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
+		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
+		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
+		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
+		if (destination[result - 1] != start)
+		{
+			result--;
+		}
+	}
+
+	assert(result >= cluster_count);
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	unsigned int cache_size = 16;
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+
+	// generate hard boundaries from full-triangle cache misses
+	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
+	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
+
+	// generate soft boundaries
+	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
+	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
+
+	const unsigned int* clusters = soft_clusters;
+	size_t cluster_count = soft_cluster_count;
+
+	// fill sort data
+	float* sort_data = allocator.allocate<float>(cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+
+	// sort clusters using sort data
+	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
+	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
+	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
+
+	// fill output buffer
+	size_t offset = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		unsigned int cluster = sort_order[it];
+		assert(cluster < cluster_count);
+
+		size_t cluster_begin = clusters[cluster] * 3;
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
+		offset += cluster_end - cluster_begin;
+	}
+
+	assert(offset == index_count);
+}

+ 76 - 0
include/meshoptimizer/quantization.cpp

@@ -0,0 +1,76 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+
+union FloatBits
+{
+	float f;
+	unsigned int ui;
+};
+
+unsigned short meshopt_quantizeHalf(float v)
+{
+	FloatBits u = {v};
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000;
+	int em = ui & 0x7fffffff;
+
+	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	// underflow: flush to zero; 113 encodes exponent -14
+	h = (em < (113 << 23)) ? 0 : h;
+
+	// overflow: infinity; 143 encodes exponent 16
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	// NaN; note that we convert all types of NaN to qNaN
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N)
+{
+	assert(N >= 0 && N <= 23);
+
+	FloatBits u = {v};
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1;
+	const int round = (1 << (23 - N)) >> 1;
+
+	int e = ui & 0x7f800000;
+	unsigned int rui = (ui + round) & ~mask;
+
+	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
+	ui = e == 0x7f800000 ? ui : rui;
+
+	// flush denormals to zero
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}
+
+float meshopt_dequantizeHalf(unsigned short h)
+{
+	unsigned int s = unsigned(h & 0x8000) << 16;
+	int em = h & 0x7fff;
+
+	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+	int r = (em + (112 << 10)) << 13;
+
+	// denormal: flush to zero
+	r = (em < (1 << 10)) ? 0 : r;
+
+	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+	r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+	FloatBits u;
+	u.ui = s | r;
+	return u.f;
+}

+ 2332 - 0
include/meshoptimizer/simplifier.cpp

@@ -0,0 +1,2332 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#if TRACE
+#define TRACESTATS(i) stats[i]++;
+#else
+#define TRACESTATS(i) (void)0
+#endif
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
+namespace meshopt
+{
+
+struct EdgeAdjacency
+{
+	struct Edge
+	{
+		unsigned int next;
+		unsigned int prev;
+	};
+
+	unsigned int* offsets;
+	Edge* data;
+};
+
+static void prepareEdgeAdjacency(EdgeAdjacency& adjacency, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count + 1);
+	adjacency.data = allocator.allocate<EdgeAdjacency::Edge>(index_count);
+}
+
+static void updateEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* remap)
+{
+	size_t face_count = index_count / 3;
+	unsigned int* offsets = adjacency.offsets + 1;
+	EdgeAdjacency::Edge* data = adjacency.data;
+
+	// fill edge counts
+	memset(offsets, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = remap ? remap[indices[i]] : indices[i];
+		assert(v < vertex_count);
+
+		offsets[v]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int count = offsets[i];
+		offsets[i] = offset;
+		offset += count;
+	}
+
+	assert(offset == index_count);
+
+	// fill edge data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		if (remap)
+		{
+			a = remap[a];
+			b = remap[b];
+			c = remap[c];
+		}
+
+		data[offsets[a]].next = b;
+		data[offsets[a]].prev = c;
+		offsets[a]++;
+
+		data[offsets[b]].next = c;
+		data[offsets[b]].prev = a;
+		offsets[b]++;
+
+		data[offsets[c]].next = a;
+		data[offsets[c]].prev = b;
+		offsets[c]++;
+	}
+
+	// finalize offsets
+	adjacency.offsets[0] = 0;
+	assert(adjacency.offsets[vertex_count] == index_count);
+}
+
+struct PositionHasher
+{
+	const float* vertex_positions;
+	size_t vertex_stride_float;
+	const unsigned int* sparse_remap;
+
+	size_t hash(unsigned int index) const
+	{
+		unsigned int ri = sparse_remap ? sparse_remap[index] : index;
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + ri * vertex_stride_float);
+
+		// scramble bits to make sure that integer coordinates have entropy in lower bits
+		unsigned int x = key[0] ^ (key[0] >> 17);
+		unsigned int y = key[1] ^ (key[1] >> 17);
+		unsigned int z = key[2] ^ (key[2] >> 17);
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs;
+		unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs;
+
+		return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0;
+	}
+};
+
+struct RemapHasher
+{
+	unsigned int* remap;
+
+	size_t hash(unsigned int id) const
+	{
+		return id * 0x5bd1e995;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return remap[lhs] == rhs;
+	}
+};
+
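+// returns a power-of-two bucket count that keeps the hash table load factor at or below ~80%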
+static size_t hashBuckets2(size_t count)
+{
+	size_t buckets = 1;
+	while (buckets < count + count / 4)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0);
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item;
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod;
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return NULL;
+}
+
+static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap, meshopt_Allocator& allocator)
+{
+	PositionHasher hasher = {vertex_positions_data, vertex_positions_stride / sizeof(float), sparse_remap};
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	// build forward remap: for each vertex, which other (canonical) vertex does it map to?
+	// we use position equivalence for this, and remap vertices to other existing vertices
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup2(table, table_size, hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index;
+
+		remap[index] = *entry;
+	}
+
+	// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
+	// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
+	for (size_t i = 0; i < vertex_count; ++i)
+		wedge[i] = unsigned(i);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != i)
+		{
+			unsigned int r = remap[i];
+
+			wedge[i] = wedge[r];
+			wedge[r] = unsigned(i);
+		}
+
+	allocator.deallocate(table);
+}
+
+static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator)
+{
+	// use a bit set to compute the precise number of unique vertices
+	unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
+	memset(filter, 0, (vertex_count + 7) / 8);
+
+	size_t unique = 0;
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		unique += (filter[index / 8] & (1 << (index % 8))) == 0;
+		filter[index / 8] |= 1 << (index % 8);
+	}
+
+	unsigned int* remap = allocator.allocate<unsigned int>(unique);
+	size_t offset = 0;
+
+	// temporary map dense => sparse; we allocate it last so that we can deallocate it
+	size_t revremap_size = hashBuckets2(unique);
+	unsigned int* revremap = allocator.allocate<unsigned int>(revremap_size);
+	memset(revremap, -1, revremap_size * sizeof(unsigned int));
+
+	// fill remap, using revremap as a helper, and rewrite indices in the same pass
+	RemapHasher hasher = {remap};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+
+		unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
+
+		if (*entry == ~0u)
+		{
+			remap[offset] = index;
+			*entry = unsigned(offset);
+			offset++;
+		}
+
+		indices[i] = *entry;
+	}
+
+	allocator.deallocate(revremap);
+
+	assert(offset == unique);
+	*out_vertex_count = unique;
+
+	return remap;
+}
+
+enum VertexKind
+{
+	Kind_Manifold, // not on an attribute seam, not on any boundary
+	Kind_Border,   // not on an attribute seam, has exactly two open edges
+	Kind_Seam,     // on an attribute seam with exactly two attribute seam edges
+	Kind_Complex,  // none of the above; these vertices can move as long as all wedges move to the target vertex
+	Kind_Locked,   // none of the above; these vertices can't move
+
+	Kind_Count
+};
+
+// manifold vertices can collapse onto anything
+// border/seam vertices can collapse onto border/seam respectively, or locked
+// complex vertices can collapse onto complex/locked
+// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
+// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
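+// rows are indexed by the kind of the vertex being collapsed, columns by the kind of the target vertex, both in Kind_* enumeration order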
+const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 1, 1},
+    {0, 1, 0, 0, 1},
+    {0, 0, 1, 0, 1},
+    {0, 0, 0, 1, 1},
+    {0, 0, 0, 0, 0},
+};
+
+// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
+// note that for seam edges, the opposite edge isn't present in the attribute-based topology
+// but is present if you consider a position-only mesh variant
+const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 0, 1},
+    {1, 0, 1, 0, 0},
+    {1, 1, 1, 0, 1},
+    {0, 0, 0, 0, 0},
+    {1, 0, 1, 0, 0},
+};
+
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
+{
+	unsigned int count = adjacency.offsets[a + 1] - adjacency.offsets[a];
+	const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[a];
+
+	for (size_t i = 0; i < count; ++i)
+		if (edges[i].next == b)
+			return true;
+
+	return false;
+}
+
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options)
+{
+	memset(loop, -1, vertex_count * sizeof(unsigned int));
+	memset(loopback, -1, vertex_count * sizeof(unsigned int));
+
+	// incoming & outgoing open edges: ~0u if there are no open edges, the other endpoint if there is exactly one, the vertex itself if there is more than one
+	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
+	// but here it's okay to fill the data out for other types of vertices as well
+	unsigned int* openinc = loopback;
+	unsigned int* openout = loop;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int vertex = unsigned(i);
+
+		unsigned int count = adjacency.offsets[vertex + 1] - adjacency.offsets[vertex];
+		const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[vertex];
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int target = edges[j].next;
+
+			if (target == vertex)
+			{
+				// degenerate triangles have two distinct edges instead of three, and the self edge
+				// is bi-directional by definition; this can break border/seam classification by "closing"
+				// the open edge from another triangle and falsely marking the vertex as manifold
+				// instead we mark the vertex as having >1 open edges which turns it into locked/complex
+				openinc[vertex] = openout[vertex] = vertex;
+			}
+			else if (!hasEdge(adjacency, target, vertex))
+			{
+				openinc[target] = (openinc[target] == ~0u) ? vertex : target;
+				openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
+			}
+		}
+	}
+
+#if TRACE
+	size_t stats[4] = {};
+#endif
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i)
+		{
+			if (wedge[i] == i)
+			{
+				// no attribute seam, need to check if it's manifold
+				unsigned int openi = openinc[i], openo = openout[i];
+
+				// note: we classify any vertices with no open edges as manifold
+				// this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
+				// it's unclear if this is a problem in practice
+				if (openi == ~0u && openo == ~0u)
+				{
+					result[i] = Kind_Manifold;
+				}
+				else if (openi != i && openo != i)
+				{
+					result[i] = Kind_Border;
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACESTATS(0);
+				}
+			}
+			else if (wedge[wedge[i]] == i)
+			{
+				// attribute seam; need to distinguish between Seam and Locked
+				unsigned int w = wedge[i];
+				unsigned int openiv = openinc[i], openov = openout[i];
+				unsigned int openiw = openinc[w], openow = openout[w];
+
+				// seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
+				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
+				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
+				{
+					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw] && remap[openiv] != remap[openov])
+					{
+						result[i] = Kind_Seam;
+					}
+					else
+					{
+						result[i] = Kind_Locked;
+						TRACESTATS(1);
+					}
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACESTATS(2);
+				}
+			}
+			else
+			{
+				// more than one vertex maps to this one; we don't have classification available
+				result[i] = Kind_Locked;
+				TRACESTATS(3);
+			}
+		}
+		else
+		{
+			assert(remap[i] < i);
+
+			result[i] = result[remap[i]];
+		}
+	}
+
+	if (vertex_lock)
+	{
+		// vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (vertex_lock[sparse_remap ? sparse_remap[i] : i])
+				result[remap[i]] = Kind_Locked;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (result[remap[i]] == Kind_Locked)
+				result[i] = Kind_Locked;
+	}
+
+	if (options & meshopt_SimplifyLockBorder)
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (result[i] == Kind_Border)
+				result[i] = Kind_Locked;
+
+#if TRACE
+	printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
+	    int(stats[0]), int(stats[1]), int(stats[2]), int(stats[3]));
+#endif
+}
+
+struct Vector3
+{
+	float x, y, z;
+};
+
+static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+		const float* v = vertex_positions_data + ri * vertex_stride_float;
+
+		if (result)
+		{
+			result[i].x = v[0];
+			result[i].y = v[1];
+			result[i].z = v[2];
+		}
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f;
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	if (result)
+	{
+		float scale = extent == 0 ? 0.f : 1.f / extent;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			result[i].x = (result[i].x - minv[0]) * scale;
+			result[i].y = (result[i].y - minv[1]) * scale;
+			result[i].z = (result[i].z - minv[2]) * scale;
+		}
+	}
+
+	return extent;
+}
+
+static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* attribute_remap, const unsigned int* sparse_remap)
+{
+	size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+		for (size_t k = 0; k < attribute_count; ++k)
+		{
+			unsigned int rk = attribute_remap[k];
+			float a = vertex_attributes_data[ri * vertex_attributes_stride_float + rk];
+
+			result[i * attribute_count + k] = a * attribute_weights[rk];
+		}
+	}
+}
+
+static const size_t kMaxAttributes = 32;
+
+struct Quadric
+{
+	// a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + 2*(b0*x + b1*y + b2*z) + c
+	float a00, a11, a22;
+	float a10, a20, a21;
+	float b0, b1, b2, c;
+	float w;
+};
+
+struct QuadricGrad
+{
+	// gx*x + gy*y + gz*z + gw
+	float gx, gy, gz, gw;
+};
+
+struct Reservoir
+{
+	float x, y, z;
+	float r, g, b;
+	float w;
+};
+
+struct Collapse
+{
+	unsigned int v0;
+	unsigned int v1;
+
+	union
+	{
+		unsigned int bidi;
+		float error;
+		unsigned int errorui;
+	};
+};
+
+static float normalize(Vector3& v)
+{
+	float length = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+
+	if (length > 0)
+	{
+		v.x /= length;
+		v.y /= length;
+		v.z /= length;
+	}
+
+	return length;
+}
+
+static void quadricAdd(Quadric& Q, const Quadric& R)
+{
+	Q.a00 += R.a00;
+	Q.a11 += R.a11;
+	Q.a22 += R.a22;
+	Q.a10 += R.a10;
+	Q.a20 += R.a20;
+	Q.a21 += R.a21;
+	Q.b0 += R.b0;
+	Q.b1 += R.b1;
+	Q.b2 += R.b2;
+	Q.c += R.c;
+	Q.w += R.w;
+}
+
+static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count)
+{
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		G[k].gx += R[k].gx;
+		G[k].gy += R[k].gy;
+		G[k].gz += R[k].gz;
+		G[k].gw += R[k].gw;
+	}
+}
+
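+// evaluates the quadratic form stored in Q at point v; cross terms (a10/a20/a21) and linear terms (b0/b1/b2) are stored halved, hence the multiplication by 2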
+static float quadricEval(const Quadric& Q, const Vector3& v)
+{
+	float rx = Q.b0;
+	float ry = Q.b1;
+	float rz = Q.b2;
+
+	rx += Q.a10 * v.y;
+	ry += Q.a21 * v.z;
+	rz += Q.a20 * v.x;
+
+	rx *= 2;
+	ry *= 2;
+	rz *= 2;
+
+	rx += Q.a00 * v.x;
+	ry += Q.a11 * v.y;
+	rz += Q.a22 * v.z;
+
+	float r = Q.c;
+	r += rx * v.x;
+	r += ry * v.y;
+	r += rz * v.z;
+
+	return r;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+	float r = quadricEval(Q, v);
+	float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
+
+	return fabsf(r) * s;
+}
+
+static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va)
+{
+	float r = quadricEval(Q, v);
+
+	// see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		float a = va[k];
+		float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw;
+
+		r += a * (a * Q.w - 2 * g);
+	}
+
+	// note: unlike position error, we do not normalize by Q.w to retain edge scaling as described in quadricFromAttributes
+	return fabsf(r);
+}
+
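+// constructs Q such that quadricEval(Q, v) == w * (a*x + b*y + c*z + d)^2, i.e. the weighted squared distance to the plane (assuming a unit-length plane normal)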
+static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
+{
+	float aw = a * w;
+	float bw = b * w;
+	float cw = c * w;
+	float dw = d * w;
+
+	Q.a00 = a * aw;
+	Q.a11 = b * bw;
+	Q.a22 = c * cw;
+	Q.a10 = a * bw;
+	Q.a20 = a * cw;
+	Q.a21 = b * cw;
+	Q.b0 = a * dw;
+	Q.b1 = b * dw;
+	Q.b2 = c * dw;
+	Q.c = d * dw;
+	Q.w = w;
+}
+
+static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// normal = cross(p1 - p0, p2 - p0)
+	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+	float area = normalize(normal);
+
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+	// we use sqrtf(area) so that the error is scaled linearly; this tends to improve silhouettes
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, sqrtf(area) * weight);
+}
+
+static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+
+	// edge length; keep squared length around for projection correction
+	float lengthsq = p10.x * p10.x + p10.y * p10.y + p10.z * p10.z;
+	float length = sqrtf(lengthsq);
+
+	// p20p = length of projection of p2-p0 onto p1-p0; note that p10 is unnormalized so we need to correct it later
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+	float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
+
+	// perp = perpendicular vector from p2 to line segment p1-p0
+	// note: since p10 is unnormalized we need to correct the projection; we scale p20 instead to take advantage of normalize below
+	Vector3 perp = {p20.x * lengthsq - p10.x * p20p, p20.y * lengthsq - p10.y * p20p, p20.z * lengthsq - p10.z * p20p};
+	normalize(perp);
+
+	float distance = perp.x * p0.x + perp.y * p0.y + perp.z * p0.z;
+
+	// note: the weight is scaled linearly with edge length; this has to match the triangle weight
+	quadricFromPlane(Q, perp.x, perp.y, perp.z, -distance, length * weight);
+}
+
+static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count)
+{
+	// for each attribute we want to encode the following function into the quadric:
+	// (eval(pos) - attr)^2
+	// where eval(pos) interpolates attribute across the triangle like so:
+	// eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw
+	// where gx/gy/gz/gw are gradients
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// normal = cross(p1 - p0, p2 - p0)
+	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+	float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z) * 0.5f;
+
+	// quadric is weighted with the square of edge length (= area)
+	// this equalizes the units with the positional error (which, after normalization, is a square of distance)
+	// as a result, a change in weighted attribute of 1 along distance d is approximately equivalent to a change in position of d
+	float w = area;
+
+	// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
+	// v = (d11 * d20 - d01 * d21) / denom
+	// w = (d00 * d21 - d01 * d20) / denom
+	// u = 1 - v - w
+	// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+	// note: v2 and d20/d21 cannot be evaluated here as v2 is effectively an unknown variable; we need these only as variables for derivation of gradients
+	const Vector3& v0 = p10;
+	const Vector3& v1 = p20;
+	float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
+	float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+	float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z;
+	float denom = d00 * d11 - d01 * d01;
+	float denomr = denom == 0 ? 0.f : 1.f / denom;
+
+	// precompute gradient factors
+	// these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out expressions that are shared between attributes
+	float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
+	float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
+	float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
+	float gy2 = (d00 * v1.y - d01 * v0.y) * denomr;
+	float gz1 = (d11 * v0.z - d01 * v1.z) * denomr;
+	float gz2 = (d00 * v1.z - d01 * v0.z) * denomr;
+
+	memset(&Q, 0, sizeof(Quadric));
+
+	Q.w = w;
+
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		float a0 = va0[k], a1 = va1[k], a2 = va2[k];
+
+		// compute gradient of eval(pos) for x/y/z/w
+		// the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w
+		float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0);
+		float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0);
+		float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0);
+		float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz;
+
+		// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
+		// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+		// note: for simplicity we scale all factors by weight here instead of outside the loop
+		Q.a00 += w * (gx * gx);
+		Q.a11 += w * (gy * gy);
+		Q.a22 += w * (gz * gz);
+
+		Q.a10 += w * (gy * gx);
+		Q.a20 += w * (gz * gx);
+		Q.a21 += w * (gz * gy);
+
+		Q.b0 += w * (gx * gw);
+		Q.b1 += w * (gy * gw);
+		Q.b2 += w * (gz * gw);
+
+		Q.c += w * (gw * gw);
+
+		// the only remaining sum components are ones that depend on attr; these will be added during error evaluation, see quadricError
+		G[k].gx = w * gx;
+		G[k].gy = w * gy;
+		G[k].gz = w * gz;
+		G[k].gw = w * gw;
+	}
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		Quadric Q;
+		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
+
+		quadricAdd(vertex_quadrics[remap[i0]], Q);
+		quadricAdd(vertex_quadrics[remap[i1]], Q);
+		quadricAdd(vertex_quadrics[remap[i2]], Q);
+	}
+}
+
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[4] = {1, 2, 0, 1};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// check that either i0 or i1 are border/seam and are on the same edge loop
+			// note that we need to add the error even for edges that connect e.g. border & locked
+			// if we don't do that, the adjacent border->border edge won't have correct errors for corners
+			if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
+				continue;
+
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+				continue;
+
+			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			unsigned int i2 = indices[i + next[e + 1]];
+
+			// we try hard to maintain border edge geometry; seam edges can move more freely
+			// due to topological restrictions on collapses, seam quadrics only slightly improve collapse structure and aren't critical
+			const float kEdgeWeightSeam = 1.f;
+			const float kEdgeWeightBorder = 10.f;
+
+			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
+
+			Quadric Q;
+			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+			quadricAdd(vertex_quadrics[remap[i0]], Q);
+			quadricAdd(vertex_quadrics[remap[i1]], Q);
+		}
+	}
+}
+
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		Quadric QA;
+		QuadricGrad G[kMaxAttributes];
+		quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
+
+		quadricAdd(attribute_quadrics[i0], QA);
+		quadricAdd(attribute_quadrics[i1], QA);
+		quadricAdd(attribute_quadrics[i2], QA);
+
+		quadricAdd(&attribute_gradients[i0 * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[i1 * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[i2 * attribute_count], G, attribute_count);
+	}
+}
+
+// does triangle ABC flip when C is replaced with D?
+static bool hasTriangleFlip(const Vector3& a, const Vector3& b, const Vector3& c, const Vector3& d)
+{
+	Vector3 eb = {b.x - a.x, b.y - a.y, b.z - a.z};
+	Vector3 ec = {c.x - a.x, c.y - a.y, c.z - a.z};
+	Vector3 ed = {d.x - a.x, d.y - a.y, d.z - a.z};
+
+	Vector3 nbc = {eb.y * ec.z - eb.z * ec.y, eb.z * ec.x - eb.x * ec.z, eb.x * ec.y - eb.y * ec.x};
+	Vector3 nbd = {eb.y * ed.z - eb.z * ed.y, eb.z * ed.x - eb.x * ed.z, eb.x * ed.y - eb.y * ed.x};
+
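+	// ndp = dot(normal(abc), normal(abd)); abc/abd are the squared lengths of those (unnormalized) normals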
+	float ndp = nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z;
+	float abc = nbc.x * nbc.x + nbc.y * nbc.y + nbc.z * nbc.z;
+	float abd = nbd.x * nbd.x + nbd.y * nbd.y + nbd.z * nbd.z;
+
+	// scale is cos(angle); somewhat arbitrarily set to ~75 degrees
+	// note that the "pure" check is ndp <= 0 (90 degree cutoff) but that allows flipping through a series of close-to-90 collapses
+	return ndp <= 0.25f * sqrtf(abc * abd);
+}
+
+static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, const unsigned int* collapse_remap, unsigned int i0, unsigned int i1)
+{
+	assert(collapse_remap[i0] == i0);
+	assert(collapse_remap[i1] == i1);
+
+	const Vector3& v0 = vertex_positions[i0];
+	const Vector3& v1 = vertex_positions[i1];
+
+	const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+	size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int a = collapse_remap[edges[i].next];
+		unsigned int b = collapse_remap[edges[i].prev];
+
+		// skip triangles that will get collapsed by i0->i1 collapse or already got collapsed previously
+		if (a == i1 || b == i1 || a == b)
+			continue;
+
+		// early-out when at least one triangle flips due to a collapse
+		if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
+		{
+#if TRACE >= 2
+			printf("edge block %d -> %d: flip welded %d %d %d\n", i0, i1, a, i0, b);
+#endif
+
+			return true;
+		}
+	}
+
+	return false;
+}
+
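+// computes a conservative upper bound on the number of candidates pickEdgeCollapses can emit:
+// one candidate per half-edge, minus half of the half-edges at manifold/seam vertices that get skipped as redundant opposite pairs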
+static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
+{
+	size_t dual_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned char k = vertex_kind[i];
+		unsigned int e = adjacency.offsets[i + 1] - adjacency.offsets[i];
+
+		dual_count += (k == Kind_Manifold || k == Kind_Seam) ? e : 0;
+	}
+
+	assert(dual_count <= index_count);
+
+	// pad capacity by 3 so that we can check for overflow once per triangle instead of once per edge
+	return (index_count - dual_count / 2) + 3;
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{
+	size_t collapse_count = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		// this should never happen as boundEdgeCollapses should give an upper bound for the collapse count, but in the unlikely event that it does, we can just drop extra collapses
+		if (collapse_count + 3 > collapse_capacity)
+			break;
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			// this can happen either when input has a zero-length edge, or when we perform collapses for complex
+			// topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
+			// we leave edges like this alone since they may be important for preserving mesh integrity
+			if (remap[i0] == remap[i1])
+				continue;
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// the edge has to be collapsible in at least one direction
+			if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0]))
+				continue;
+
+			// manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			// two vertices are on a border or a seam, but there's no direct edge between them
+			// this indicates that they belong to two different edge loops and we should not collapse this edge
+			// loop[] tracks half edges so we only need to check i0->i1
+			if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			if (k0 == Kind_Locked || k1 == Kind_Locked)
+			{
+				// the same check as above, but for border/seam -> locked collapses
+				// loop[] and loopback[] track half edges so we only need to check one of them
+				if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+					continue;
+				if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+					continue;
+			}
+
+			// edge can be collapsed in either direction - we will pick the one with minimum error
+			// note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional
+			if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0])
+			{
+				Collapse c = {i0, i1, {/* bidi= */ 1}};
+				collapses[collapse_count++] = c;
+			}
+			else
+			{
+				// edge can only be collapsed in one direction
+				unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1;
+				unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0;
+
+				Collapse c = {e0, e1, {/* bidi= */ 0}};
+				collapses[collapse_count++] = c;
+			}
+		}
+	}
+
+	return collapse_count;
+}
+
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
+{
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		Collapse& c = collapses[i];
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		// most edges are bidirectional which means we need to evaluate errors for two collapses
+		// to keep this code branchless we just use the same edge for unidirectional edges
+		unsigned int j0 = c.bidi ? i1 : i0;
+		unsigned int j1 = c.bidi ? i0 : i1;
+
+		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
+		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+
+#if TRACE >= 3
+		float di = ei, dj = ej;
+#endif
+
+		if (attribute_count)
+		{
+			// note: ideally we would evaluate max/avg of attribute errors for seam edges, but it's not clear if it's worth the extra cost
+			ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+			ej += quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+		}
+
+		// pick edge direction with minimal error
+		c.v0 = ei <= ej ? i0 : j0;
+		c.v1 = ei <= ej ? i1 : j1;
+		c.error = ei <= ej ? ei : ej;
+
+#if TRACE >= 3
+		if (i0 == j0) // c.bidi has been overwritten
+			printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", c.v0, c.v1,
+			    sqrtf(c.error), sqrtf(ei <= ej ? di : dj), sqrtf(ei <= ej ? ei - di : ej - dj));
+		else
+			printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n", c.v0, c.v1,
+			    sqrtf(ei <= ej ? ei : ej), sqrtf(ei <= ej ? di : dj), sqrtf(ei <= ej ? ei - di : ej - dj),
+			    sqrtf(ei <= ej ? ej : ei), sqrtf(ei <= ej ? dj : di), sqrtf(ei <= ej ? ej - dj : ei - di));
+#endif
+	}
+}
+
+static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
+{
+	// we use counting sort to order collapses by error; since the exact sort order is not as critical,
+	// only top 12 bits of exponent+mantissa (8 bits of exponent and 4 bits of mantissa) are used.
+	// to avoid excessive stack usage, we clamp the exponent range as collapses with errors much higher than 1 are not useful.
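+	// the bit pattern of a non-negative IEEE-754 float increases monotonically with its value, so the top exponent+mantissa bits form a valid ordinal key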
+	const unsigned int sort_bits = 12;
+	const unsigned int sort_bins = 2048 + 512; // exponent range [-127, 32)
+
+	// fill histogram for counting sort
+	unsigned int histogram[sort_bins];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int error = collapses[i].errorui;
+		unsigned int key = (error << 1) >> (32 - sort_bits);
+		key = key < sort_bins ? key : sort_bins - 1;
+
+		histogram[key]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < sort_bins; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == collapse_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int error = collapses[i].errorui;
+		unsigned int key = (error << 1) >> (32 - sort_bits);
+		key = key < sort_bins ? key : sort_bins - 1;
+
+		sort_order[histogram[key]++] = unsigned(i);
+	}
+}
+
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+{
+	size_t edge_collapses = 0;
+	size_t triangle_collapses = 0;
+
+	// most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+	// note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+	size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+#if TRACE
+	size_t stats[7] = {};
+#endif
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		const Collapse& c = collapses[collapse_order[i]];
+
+		TRACESTATS(0);
+
+		if (c.error > error_limit)
+		{
+			TRACESTATS(4);
+			break;
+		}
+
+		if (triangle_collapses >= triangle_collapse_goal)
+		{
+			TRACESTATS(5);
+			break;
+		}
+
+		// we limit the error in each pass based on the error of the optimal last collapse; since many collapses will be locked
+		// as they will share vertices with other successful collapses, we need to increase the acceptable error by some factor
+		float error_goal = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+
+		// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
+		// topology, we only abort early once we have already performed over 1/6 of the triangle collapse goal.
+		if (c.error > error_goal && c.error > result_error && triangle_collapses > triangle_collapse_goal / 6)
+		{
+			TRACESTATS(6);
+			break;
+		}
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		unsigned char kind = vertex_kind[i0];
+
+		// we don't collapse vertices that had source or target vertex involved in a collapse
+		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
+		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
+		if (collapse_locked[r0] | collapse_locked[r1])
+		{
+			TRACESTATS(1);
+			continue;
+		}
+
+		if (hasTriangleFlips(adjacency, vertex_positions, collapse_remap, r0, r1))
+		{
+			// adjust collapse goal since this collapse is invalid and shouldn't factor into error goal
+			edge_collapse_goal++;
+
+			TRACESTATS(2);
+			continue;
+		}
+
+#if TRACE >= 2
+		printf("edge commit %d -> %d: kind %d->%d, error %f\n", i0, i1, vertex_kind[i0], vertex_kind[i1], sqrtf(c.error));
+#endif
+
+		assert(collapse_remap[r0] == r0);
+		assert(collapse_remap[r1] == r1);
+
+		if (kind == Kind_Complex)
+		{
+			// remap all vertices in the complex to the target vertex
+			unsigned int v = i0;
+
+			do
+			{
+				collapse_remap[v] = i1;
+				v = wedge[v];
+			} while (v != i0);
+		}
+		else if (kind == Kind_Seam)
+		{
+			// for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+			unsigned int s0 = wedge[i0];
+			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+			assert(s0 != i0 && wedge[s0] == i0);
+			assert(s1 != ~0u && remap[s1] == r1);
+
+			// additional asserts to verify that the seam pair is consistent
+			assert(kind != vertex_kind[i1] || s1 == wedge[i1]);
+			assert(loop[i0] == i1 || loopback[i0] == i1);
+			assert(loop[s0] == s1 || loopback[s0] == s1);
+
+			// note: this should never happen due to the assertion above, but if assertions are disabled and we ever hit this case, we'd get a memory safety issue; for now play it safe
+			s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+			collapse_remap[i0] = i1;
+			collapse_remap[s0] = s1;
+		}
+		else
+		{
+			assert(wedge[i0] == i0);
+
+			collapse_remap[i0] = i1;
+		}
+
+		// note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used
+		// however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling
+		collapse_locked[r0] = 1;
+		collapse_locked[r1] = 1;
+
+		// border edges collapse 1 triangle, other edges collapse 2 or more
+		triangle_collapses += (kind == Kind_Border) ? 1 : 2;
+		edge_collapses++;
+
+		result_error = result_error < c.error ? c.error : result_error;
+	}
+
+#if TRACE
+	float error_goal_last = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+	float error_goal_limit = error_goal_last < error_limit ? error_goal_last : error_limit;
+
+	printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d); %s\n",
+	    int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_limit),
+	    int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]),
+	    stats[4] ? "error limit" : (stats[5] ? "count limit" : (stats[6] ? "error goal" : "out of collapses")));
+#endif
+
+	return edge_collapses;
+}
+
+static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (collapse_remap[i] == i)
+			continue;
+
+		unsigned int i0 = unsigned(i);
+		unsigned int i1 = collapse_remap[i];
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		// ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved
+		if (i0 == r0)
+			quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+		if (attribute_count)
+		{
+			quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]);
+			quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count);
+
+			if (i0 == r0)
+			{
+				// when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment
+				float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]);
+				vertex_error = vertex_error < derr ? derr : vertex_error;
+			}
+		}
+	}
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+{
+	size_t write = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int v0 = collapse_remap[indices[i + 0]];
+		unsigned int v1 = collapse_remap[indices[i + 1]];
+		unsigned int v2 = collapse_remap[indices[i + 2]];
+
+		// we never move the vertex twice during a single pass
+		assert(collapse_remap[v0] == v0);
+		assert(collapse_remap[v1] == v1);
+		assert(collapse_remap[v2] == v2);
+
+		if (v0 != v1 && v0 != v2 && v1 != v2)
+		{
+			indices[write + 0] = v0;
+			indices[write + 1] = v1;
+			indices[write + 2] = v2;
+			write += 3;
+		}
+	}
+
+	return write;
+}
+
+static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsigned int* collapse_remap)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		// note: this is a no-op for vertices that were remapped
+		// ideally we would clear the loop entries for those for consistency, even though they aren't going to be used
+		// however, the remapping process needs loop information for remapped vertices, so this would require a separate pass
+		if (loop[i] != ~0u)
+		{
+			unsigned int l = loop[i];
+			unsigned int r = collapse_remap[l];
+
+			// i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
+			if (i == r)
+				loop[i] = (loop[l] != ~0u) ? collapse_remap[loop[l]] : ~0u;
+			else
+				loop[i] = r;
+		}
+	}
+}
+
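+// union-find lookup with path halving: every visited node is re-pointed to its grandparent, flattening the tree over repeated calls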
+static unsigned int follow(unsigned int* parents, unsigned int index)
+{
+	while (index != parents[index])
+	{
+		unsigned int parent = parents[index];
+		parents[index] = parents[parent];
+		index = parent;
+	}
+
+	return index;
+}
+
+static size_t buildComponents(unsigned int* components, size_t vertex_count, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+		components[i] = unsigned(i);
+
+	// compute a unique (but not sequential!) index for each component via union-find
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[4] = {1, 2, 0, 1};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned int r0 = remap[i0];
+			unsigned int r1 = remap[i1];
+
+			r0 = follow(components, r0);
+			r1 = follow(components, r1);
+
+			// merge components with larger indices into components with smaller indices
+			// this guarantees that the root of the component is always the one with the smallest index
+			if (r0 != r1)
+				components[r0 < r1 ? r1 : r0] = r0 < r1 ? r0 : r1;
+		}
+	}
+
+	// make sure each element points to the component root *before* we renumber the components
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] == i)
+			components[i] = follow(components, unsigned(i));
+
+	unsigned int next_component = 0;
+
+	// renumber components using sequential indices
+	// a sequential pass is sufficient because the component root always has the smallest index
+	// note: it is unsafe to use follow() in this pass because we're replacing component links with sequential indices in place
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i)
+		{
+			unsigned int root = components[i];
+			assert(root <= i); // make sure we already computed the component for non-roots
+			components[i] = (root == i) ? next_component++ : components[root];
+		}
+		else
+		{
+			assert(remap[i] < i); // make sure we already computed the component
+			components[i] = components[remap[i]];
+		}
+	}
+
+	return next_component;
+}
+
+static void measureComponents(float* component_errors, size_t component_count, const unsigned int* components, const Vector3* vertex_positions, size_t vertex_count)
+{
+	memset(component_errors, 0, component_count * 4 * sizeof(float));
+
+	// compute approximate sphere center for each component as an average
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int c = components[i];
+		assert(components[i] < component_count);
+
+		Vector3 v = vertex_positions[i]; // copy avoids aliasing issues
+
+		component_errors[c * 4 + 0] += v.x;
+		component_errors[c * 4 + 1] += v.y;
+		component_errors[c * 4 + 2] += v.z;
+		component_errors[c * 4 + 3] += 1; // weight
+	}
+
+	// complete the center computation, and reinitialize [3] as a radius
+	for (size_t i = 0; i < component_count; ++i)
+	{
+		float w = component_errors[i * 4 + 3];
+		float iw = w == 0.f ? 0.f : 1.f / w;
+
+		component_errors[i * 4 + 0] *= iw;
+		component_errors[i * 4 + 1] *= iw;
+		component_errors[i * 4 + 2] *= iw;
+		component_errors[i * 4 + 3] = 0; // radius
+	}
+
+	// compute squared radius for each component
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int c = components[i];
+
+		float dx = vertex_positions[i].x - component_errors[c * 4 + 0];
+		float dy = vertex_positions[i].y - component_errors[c * 4 + 1];
+		float dz = vertex_positions[i].z - component_errors[c * 4 + 2];
+		float r = dx * dx + dy * dy + dz * dz;
+
+		component_errors[c * 4 + 3] = component_errors[c * 4 + 3] < r ? r : component_errors[c * 4 + 3];
+	}
+
+	// we've used the output buffer as scratch space, so we need to move the results to proper indices
+	for (size_t i = 0; i < component_count; ++i)
+	{
+#if TRACE >= 2
+		printf("component %d: center %f %f %f, error %e\n", int(i),
+		    component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3]));
+#endif
+		// note: we keep the squared error to make it match quadric error metric
+		component_errors[i] = component_errors[i * 4 + 3];
+	}
+}
+
+static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror)
+{
+	size_t write = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int c = components[indices[i]];
+		assert(c == components[indices[i + 1]] && c == components[indices[i + 2]]);
+
+		if (component_errors[c] > error_cutoff)
+		{
+			indices[write + 0] = indices[i + 0];
+			indices[write + 1] = indices[i + 1];
+			indices[write + 2] = indices[i + 2];
+			write += 3;
+		}
+	}
+
+#if TRACE
+	size_t pruned_components = 0;
+	for (size_t i = 0; i < component_count; ++i)
+		pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff);
+
+	printf("pruned %d triangles in %d components (goal %e)\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff));
+#endif
+
+	// update next error with the smallest error of the remaining components for future pruning
+	nexterror = FLT_MAX;
+	for (size_t i = 0; i < component_count; ++i)
+		if (component_errors[i] > error_cutoff)
+			nexterror = nexterror > component_errors[i] ? component_errors[i] : nexterror;
+
+	return write;
+}
+
+struct CellHasher
+{
+	const unsigned int* vertex_ids;
+
+	size_t hash(unsigned int i) const
+	{
+		unsigned int h = vertex_ids[i];
+
+		// MurmurHash2 finalizer
+		h ^= h >> 13;
+		h *= 0x5bd1e995;
+		h ^= h >> 15;
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return vertex_ids[lhs] == vertex_ids[rhs];
+	}
+};
+
+struct IdHasher
+{
+	size_t hash(unsigned int id) const
+	{
+		unsigned int h = id;
+
+		// MurmurHash2 finalizer
+		h ^= h >> 13;
+		h *= 0x5bd1e995;
+		h ^= h >> 15;
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return lhs == rhs;
+	}
+};
+
+struct TriangleHasher
+{
+	const unsigned int* indices;
+
+	size_t hash(unsigned int i) const
+	{
+		const unsigned int* tri = indices + i * 3;
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (tri[0] * 73856093) ^ (tri[1] * 19349663) ^ (tri[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const unsigned int* lt = indices + lhs * 3;
+		const unsigned int* rt = indices + rhs * 3;
+
+		return lt[0] == rt[0] && lt[1] == rt[1] && lt[2] == rt[2];
+	}
+};
+
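+// quantizes each (normalized) coordinate onto a grid_size^3 lattice; since grid_size <= 1024, each coordinate fits in 10 bits and the three pack into one 32-bit id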
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+{
+	assert(grid_size >= 1 && grid_size <= 1024);
+	float cell_scale = float(grid_size - 1);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const Vector3& v = vertex_positions[i];
+
+		int xi = int(v.x * cell_scale + 0.5f);
+		int yi = int(v.y * cell_scale + 0.5f);
+		int zi = int(v.z * cell_scale + 0.5f);
+
+		vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
+	}
+}
+
+static size_t countTriangles(const unsigned int* vertex_ids, const unsigned int* indices, size_t index_count)
+{
+	size_t result = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int id0 = vertex_ids[indices[i + 0]];
+		unsigned int id1 = vertex_ids[indices[i + 1]];
+		unsigned int id2 = vertex_ids[indices[i + 2]];
+
+		result += (id0 != id1) & (id0 != id2) & (id1 != id2);
+	}
+
+	return result;
+}
+
+static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	CellHasher hasher = {vertex_ids};
+
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int* entry = hashLookup2(table, table_size, hasher, unsigned(i), ~0u);
+
+		if (*entry == ~0u)
+		{
+			*entry = unsigned(i);
+			vertex_cells[i] = unsigned(result++);
+		}
+		else
+		{
+			vertex_cells[i] = vertex_cells[*entry];
+		}
+	}
+
+	return result;
+}
+
+static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	IdHasher hasher;
+
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int id = vertex_ids[i];
+		unsigned int* entry = hashLookup2(table, table_size, hasher, id, ~0u);
+
+		result += (*entry == ~0u);
+		*entry = id;
+	}
+
+	return result;
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		unsigned int c0 = vertex_cells[i0];
+		unsigned int c1 = vertex_cells[i1];
+		unsigned int c2 = vertex_cells[i2];
+
+		int single_cell = (c0 == c1) & (c0 == c2);
+
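+		// a triangle fully contained in one cell is accumulated once, so it gets 3x weight to match triangles that contribute to three cells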
+		Quadric Q;
+		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], single_cell ? 3.f : 1.f);
+
+		if (single_cell)
+		{
+			quadricAdd(cell_quadrics[c0], Q);
+		}
+		else
+		{
+			quadricAdd(cell_quadrics[c0], Q);
+			quadricAdd(cell_quadrics[c1], Q);
+			quadricAdd(cell_quadrics[c2], Q);
+		}
+	}
+}
+
+static void fillCellReservoirs(Reservoir* cell_reservoirs, size_t cell_count, const Vector3* vertex_positions, const float* vertex_colors, size_t vertex_colors_stride, size_t vertex_count, const unsigned int* vertex_cells)
+{
+	static const float dummy_color[] = {0.f, 0.f, 0.f};
+
+	size_t vertex_colors_stride_float = vertex_colors_stride / sizeof(float);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int cell = vertex_cells[i];
+		const Vector3& v = vertex_positions[i];
+		Reservoir& r = cell_reservoirs[cell];
+
+		const float* color = vertex_colors ? &vertex_colors[i * vertex_colors_stride_float] : dummy_color;
+
+		r.x += v.x;
+		r.y += v.y;
+		r.z += v.z;
+		r.r += color[0];
+		r.g += color[1];
+		r.b += color[2];
+		r.w += 1.f;
+	}
+
+	for (size_t i = 0; i < cell_count; ++i)
+	{
+		Reservoir& r = cell_reservoirs[i];
+
+		float iw = r.w == 0.f ? 0.f : 1.f / r.w;
+
+		r.x *= iw;
+		r.y *= iw;
+		r.z *= iw;
+		r.r *= iw;
+		r.g *= iw;
+		r.b *= iw;
+	}
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count)
+{
+	memset(cell_remap, -1, cell_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int cell = vertex_cells[i];
+		float error = quadricError(cell_quadrics[cell], vertex_positions[i]);
+
+		if (cell_remap[cell] == ~0u || cell_errors[cell] > error)
+		{
+			cell_remap[cell] = unsigned(i);
+			cell_errors[cell] = error;
+		}
+	}
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Reservoir* cell_reservoirs, const Vector3* vertex_positions, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t vertex_count)
+{
+	static const float dummy_color[] = {0.f, 0.f, 0.f};
+
+	size_t vertex_colors_stride_float = vertex_colors_stride / sizeof(float);
+
+	memset(cell_remap, -1, cell_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int cell = vertex_cells[i];
+		const Vector3& v = vertex_positions[i];
+		const Reservoir& r = cell_reservoirs[cell];
+
+		const float* color = vertex_colors ? &vertex_colors[i * vertex_colors_stride_float] : dummy_color;
+
+		float pos_error = (v.x - r.x) * (v.x - r.x) + (v.y - r.y) * (v.y - r.y) + (v.z - r.z) * (v.z - r.z);
+		float col_error = (color[0] - r.r) * (color[0] - r.r) + (color[1] - r.g) * (color[1] - r.g) + (color[2] - r.b) * (color[2] - r.b);
+		float error = pos_error + color_weight * col_error;
+
+		if (cell_remap[cell] == ~0u || cell_errors[cell] > error)
+		{
+			cell_remap[cell] = unsigned(i);
+			cell_errors[cell] = error;
+		}
+	}
+}
+
+static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
+{
+	TriangleHasher hasher = {destination};
+
+	memset(tritable, -1, tritable_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int c0 = vertex_cells[indices[i + 0]];
+		unsigned int c1 = vertex_cells[indices[i + 1]];
+		unsigned int c2 = vertex_cells[indices[i + 2]];
+
+		if (c0 != c1 && c0 != c2 && c1 != c2)
+		{
+			unsigned int a = cell_remap[c0];
+			unsigned int b = cell_remap[c1];
+			unsigned int c = cell_remap[c2];
+
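+			// rotate the triangle so that the smallest vertex index comes first (preserving winding) so duplicate triangles hash identically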
+			if (b < a && b < c)
+			{
+				unsigned int t = a;
+				a = b, b = c, c = t;
+			}
+			else if (c < a && c < b)
+			{
+				unsigned int t = c;
+				c = b, b = a, a = t;
+			}
+
+			destination[result * 3 + 0] = a;
+			destination[result * 3 + 1] = b;
+			destination[result * 3 + 2] = c;
+
+			unsigned int* entry = hashLookup2(tritable, tritable_size, hasher, unsigned(result), ~0u);
+
+			if (*entry == ~0u)
+				*entry = unsigned(result++);
+		}
+	}
+
+	return result * 3;
+}
+
+static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2)
+{
+	// three point interpolation from "revenge of interpolation search" paper
+	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
+	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
+	return x1 + num / den;
+}
+
+} // namespace meshopt
+
+// Note: this is only exposed for debug visualization purposes; do *not* use
+enum
+{
+	meshopt_SimplifyInternalDebug = 1 << 30
+};
+
+size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+	assert(target_error >= 0);
+	assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyInternalDebug)) == 0);
+	assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
+	assert(vertex_attributes_stride % sizeof(float) == 0);
+	assert(attribute_count <= kMaxAttributes);
+	for (size_t i = 0; i < attribute_count; ++i)
+		assert(attribute_weights[i] >= 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* result = destination;
+	if (result != indices)
+		memcpy(result, indices, index_count * sizeof(unsigned int));
+
+	// build an index remap and update indices/vertex_count to minimize the subsequent work
+	// note: as a consequence, errors will be computed relative to the subset extent
+	unsigned int* sparse_remap = NULL;
+	if (options & meshopt_SimplifySparse)
+		sparse_remap = buildSparseRemap(result, index_count, vertex_count, &vertex_count, allocator);
+
+	// build adjacency information
+	EdgeAdjacency adjacency = {};
+	prepareEdgeAdjacency(adjacency, index_count, vertex_count, allocator);
+	updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL);
+
+	// build position remap that maps each vertex to the one with identical position
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator);
+
+	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
+	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
+	unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+	classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge, vertex_lock, sparse_remap, options);
+
+#if TRACE
+	size_t unique_positions = 0;
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_positions += remap[i] == i;
+
+	printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions));
+
+	size_t kinds[Kind_Count] = {};
+	for (size_t i = 0; i < vertex_count; ++i)
+		kinds[vertex_kind[i]] += remap[i] == i;
+
+	printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
+	    int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+#endif
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap);
+
+	float* vertex_attributes = NULL;
+
+	if (attribute_count)
+	{
+		unsigned int attribute_remap[kMaxAttributes];
+
+		// remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics
+		size_t attributes_used = 0;
+		for (size_t i = 0; i < attribute_count; ++i)
+			if (attribute_weights[i] > 0)
+				attribute_remap[attributes_used++] = unsigned(i);
+
+		attribute_count = attributes_used;
+		vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
+		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, attribute_remap, sparse_remap);
+	}
+
+	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
+	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
+
+	Quadric* attribute_quadrics = NULL;
+	QuadricGrad* attribute_gradients = NULL;
+
+	if (attribute_count)
+	{
+		attribute_quadrics = allocator.allocate<Quadric>(vertex_count);
+		memset(attribute_quadrics, 0, vertex_count * sizeof(Quadric));
+
+		attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
+		memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+	}
+
+	fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap);
+	fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+
+	if (attribute_count)
+		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count);
+
+	unsigned int* components = NULL;
+	float* component_errors = NULL;
+	size_t component_count = 0;
+	float component_nexterror = 0;
+
+	if (options & meshopt_SimplifyPrune)
+	{
+		components = allocator.allocate<unsigned int>(vertex_count);
+		component_count = buildComponents(components, vertex_count, result, index_count, remap);
+
+		component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+		measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+		component_nexterror = FLT_MAX;
+		for (size_t i = 0; i < component_count; ++i)
+			component_nexterror = component_nexterror > component_errors[i] ? component_errors[i] : component_nexterror;
+
+#if TRACE
+		printf("components: %d (min error %e)\n", int(component_count), sqrtf(component_nexterror));
+#endif
+	}
+
+#if TRACE
+	size_t pass_count = 0;
+#endif
+
+	size_t collapse_capacity = boundEdgeCollapses(adjacency, vertex_count, index_count, vertex_kind);
+
+	Collapse* edge_collapses = allocator.allocate<Collapse>(collapse_capacity);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(collapse_capacity);
+	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
+
+	size_t result_count = index_count;
+	float result_error = 0;
+	float vertex_error = 0;
+
+	// target_error input is linear; we need to adjust it to match quadricError units
+	float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? vertex_scale : 1.f;
+	float error_limit = (target_error * target_error) / (error_scale * error_scale);
+
+	while (result_count > target_index_count)
+	{
+		// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
+		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop, loopback);
+		assert(edge_collapse_count <= collapse_capacity);
+
+		// no edges can be collapsed any more due to topology restrictions
+		if (edge_collapse_count == 0)
+			break;
+
+#if TRACE
+		printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
+#endif
+
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+
+		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
+
+		size_t triangle_collapse_goal = (result_count - target_index_count) / 3;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			collapse_remap[i] = unsigned(i);
+
+		memset(collapse_locked, 0, vertex_count);
+
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
+
+		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
+		if (collapses == 0)
+			break;
+
+		updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error);
+
+		// updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent
+		vertex_error = attribute_count == 0 ? result_error : vertex_error;
+
+		remapEdgeLoops(loop, vertex_count, collapse_remap);
+		remapEdgeLoops(loopback, vertex_count, collapse_remap);
+
+		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+		assert(new_count < result_count);
+
+		result_count = new_count;
+
+		if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error)
+			result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror);
+	}
+
+	// we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit
+	while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit)
+	{
+#if TRACE
+		printf("pass %d: cleanup; ", int(pass_count++));
+#endif
+
+		float component_cutoff = component_nexterror * 1.5f < error_limit ? component_nexterror * 1.5f : error_limit;
+
+		// track maximum error in eligible components as we are increasing resulting error
+		float component_maxerror = 0;
+		for (size_t i = 0; i < component_count; ++i)
+			if (component_errors[i] > component_maxerror && component_errors[i] <= component_cutoff)
+				component_maxerror = component_errors[i];
+
+		size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror);
+		if (new_count == result_count)
+			break;
+
+		result_count = new_count;
+		result_error = result_error < component_maxerror ? component_maxerror : result_error;
+		vertex_error = vertex_error < component_maxerror ? component_maxerror : vertex_error;
+	}
+
+#if TRACE
+	printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count));
+#endif
+
+	// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
+	if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap)
+	{
+		assert(Kind_Count <= 8 && vertex_count < (1 << 28)); // 3 bit kind, 1 bit loop
+
+		for (size_t i = 0; i < result_count; i += 3)
+		{
+			unsigned int a = result[i + 0], b = result[i + 1], c = result[i + 2];
+
+			result[i + 0] |= (vertex_kind[a] << 28) | (unsigned(loop[a] == b || loopback[b] == a) << 31);
+			result[i + 1] |= (vertex_kind[b] << 28) | (unsigned(loop[b] == c || loopback[c] == b) << 31);
+			result[i + 2] |= (vertex_kind[c] << 28) | (unsigned(loop[c] == a || loopback[a] == c) << 31);
+		}
+	}
+
+	// convert resulting indices back into the dense space of the larger mesh
+	if (sparse_remap)
+		for (size_t i = 0; i < result_count; ++i)
+			result[i] = sparse_remap[result[i]];
+
+	// result_error is quadratic; we need to remap it back to linear
+	if (out_result_error)
+		*out_result_error = sqrtf(result_error) * error_scale;
+
+	return result_count;
+}
+
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, target_index_count, target_error, options, out_result_error);
+}
+
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error);
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	// we expect to get ~2 triangles/vertex in the output
+	size_t target_cell_count = target_index_count / 6;
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
+	printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of triangles in min_grid <= target_count
+	int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error));
+	int max_grid = 1025;
+	size_t min_triangles = 0;
+	size_t max_triangles = index_count / 3;
+
+	// when we're error-limited, we compute the triangle count for the minimum grid size; this accelerates convergence and provides the correct answer when we can't use a larger grid
+	if (min_grid > 1)
+	{
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+		min_triangles = countTriangles(vertex_ids, indices, index_count);
+	}
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+	{
+		if (min_triangles >= target_index_count / 3 || max_grid - min_grid <= 1)
+			break;
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t triangles = countTriangles(vertex_ids, indices, index_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, triangles %d, %s\n",
+		    pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses ? "lerp" : "binary"),
+		    grid_size, int(triangles),
+		    (triangles <= target_index_count / 3) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+
+		if (triangles <= target_index_count / 3)
+		{
+			min_grid = grid_size;
+			min_triangles = triangles;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_triangles = triangles;
+		}
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_triangles == 0)
+	{
+		if (out_result_error)
+			*out_result_error = 1.f;
+
+		return 0;
+	}
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// compute error
+	float result_error = 0.f;
+
+	for (size_t i = 0; i < cell_count; ++i)
+		result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error;
+
+	// collapse triangles!
+	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+	size_t tritable_size = hashBuckets2(min_triangles);
+	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+
+	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
+
+#if TRACE
+	printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
+#endif
+
+	if (out_result_error)
+		*out_result_error = sqrtf(result_error);
+
+	return write;
+}
+
+size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(vertex_colors_stride == 0 || (vertex_colors_stride >= 12 && vertex_colors_stride <= 256));
+	assert(vertex_colors_stride % sizeof(float) == 0);
+	assert(vertex_colors == NULL || vertex_colors_stride != 0);
+	assert(target_vertex_count <= vertex_count);
+
+	size_t target_cell_count = target_vertex_count;
+
+	if (target_cell_count == 0)
+		return 0;
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices\n", int(vertex_count));
+	printf("target: %d cells\n", int(target_cell_count));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of vertices in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025;
+	size_t min_vertices = 0;
+	size_t max_vertices = vertex_count;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! cell count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+	{
+		assert(min_vertices < target_vertex_count);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, vertices %d, %s\n",
+		    pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses ? "lerp" : "binary"),
+		    grid_size, int(vertices),
+		    (vertices <= target_vertex_count) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices));
+
+		if (vertices <= target_vertex_count)
+		{
+			min_grid = grid_size;
+			min_vertices = vertices;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_vertices = vertices;
+		}
+
+		if (vertices == target_vertex_count || max_grid - min_grid <= 1)
+			break;
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_vertices == 0)
+		return 0;
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// accumulate points into a reservoir for each target cell
+	Reservoir* cell_reservoirs = allocator.allocate<Reservoir>(cell_count);
+	memset(cell_reservoirs, 0, cell_count * sizeof(Reservoir));
+
+	fillCellReservoirs(cell_reservoirs, cell_count, vertex_positions, vertex_colors, vertex_colors_stride, vertex_count, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	// we scale the color weight to bring it to the same scale as position so that error addition makes sense
+	float color_weight_scaled = color_weight * (min_grid == 1 ? 1.f : 1.f / (min_grid - 1));
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight_scaled * color_weight_scaled, vertex_count);
+
+	// copy results to the output
+	assert(cell_count <= target_vertex_count);
+	memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count);
+
+#if TRACE
+	// compute error
+	float result_error = 0.f;
+
+	for (size_t i = 0; i < cell_count; ++i)
+		result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error;
+
+	printf("result: %d cells, %e error\n", int(cell_count), sqrtf(result_error));
+#endif
+
+	return cell_count;
+}
+
+float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);
+
+	return extent;
+}
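
A minimal caller-side sketch (illustrative, not part of this commit) for the simplifier entry points above, assuming indices in a std::vector<unsigned int> and tightly packed float xyz positions; the 25% target and 0.01 relative error are arbitrary example values.

#include <vector>

#include "meshoptimizer.h"

std::vector<unsigned int> simplifyMesh(const std::vector<unsigned int>& indices, const std::vector<float>& positions_xyz)
{
	size_t vertex_count = positions_xyz.size() / 3;

	std::vector<unsigned int> lod(indices.size());
	float lod_error = 0.f;

	size_t lod_count = meshopt_simplify(lod.data(), indices.data(), indices.size(),
	    positions_xyz.data(), vertex_count, sizeof(float) * 3,
	    /* target_index_count= */ indices.size() / 4, /* target_error= */ 0.01f, /* options= */ 0, &lod_error);
	lod.resize(lod_count);

	// without meshopt_SimplifyErrorAbsolute the reported error is relative to the mesh extent;
	// multiplying by meshopt_simplifyScale converts it to the units of the source positions
	float absolute_error = lod_error * meshopt_simplifyScale(positions_xyz.data(), vertex_count, sizeof(float) * 3);
	(void)absolute_error;

	return lod;
}

meshopt_simplifyWithAttributes and meshopt_simplifySloppy accept the same position layout; the sloppy variant does not preserve topology and is typically used when the edge-collapse path cannot reach the target count.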

+ 194 - 0
include/meshoptimizer/spatialorder.cpp

@@ -0,0 +1,194 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Decoding Morton codes. 2009
+namespace meshopt
+{
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline unsigned int part1By2(unsigned int x)
+{
+	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
+	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x << 8)) & 0x0300f00f;  // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x << 4)) & 0x030c30c3;  // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x << 2)) & 0x09249249;  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	return x;
+}
+
+static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f;
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent;
+
+	// generate Morton order based on the position inside a unit cube
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
+		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
+		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+
+		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+	}
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+{
+	memset(hist, 0, sizeof(hist));
+
+	// compute 3 10-bit histograms in parallel
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = data[i];
+
+		hist[(id >> 0) & 1023][0]++;
+		hist[(id >> 10) & 1023][1]++;
+		hist[(id >> 20) & 1023][2]++;
+	}
+
+	unsigned int sumx = 0, sumy = 0, sumz = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+		hist[i][0] = sumx;
+		hist[i][1] = sumy;
+		hist[i][2] = sumz;
+
+		sumx += hx;
+		sumy += hy;
+		sumz += hz;
+	}
+
+	assert(sumx == count && sumy == count && sumz == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+	int bitoff = pass * 10;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+
+		destination[hist[id][pass]++] = source[i];
+	}
+}
+
+} // namespace meshopt
+
+void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+
+	unsigned int hist[1024][3];
+	computeHistogram(hist, keys, vertex_count);
+
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[i] = unsigned(i);
+
+	// 3-pass radix sort computes the resulting order into scratch
+	radixPass(scratch, destination, keys, vertex_count, hist, 0);
+	radixPass(destination, scratch, keys, vertex_count, hist, 1);
+	radixPass(scratch, destination, keys, vertex_count, hist, 2);
+
+	// since our remap table is mapping old=>new, we need to reverse it
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[scratch[i]] = unsigned(i);
+}
+
+void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	(void)vertex_count;
+
+	size_t face_count = index_count / 3;
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	float* centroids = allocator.allocate<float>(face_count * 3);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* va = vertex_positions + a * vertex_stride_float;
+		const float* vb = vertex_positions + b * vertex_stride_float;
+		const float* vc = vertex_positions + c * vertex_stride_float;
+
+		centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
+		centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
+		centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
+	}
+
+	unsigned int* remap = allocator.allocate<unsigned int>(face_count);
+
+	meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3);
+
+	// support in-order remap
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		unsigned int r = remap[i];
+
+		destination[r * 3 + 0] = a;
+		destination[r * 3 + 1] = b;
+		destination[r * 3 + 2] = c;
+	}
+}
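
An illustrative caller-side sketch (not part of this commit) for the spatial sorter above, using the same flat position layout; meshopt_spatialSortTriangles explicitly supports destination == indices, so the reorder can run in place.

#include <vector>

#include "meshoptimizer.h"

// reorder triangles along the Morton curve to improve spatial coherence of the index buffer
void spatialSortInPlace(std::vector<unsigned int>& indices, const std::vector<float>& positions_xyz)
{
	size_t vertex_count = positions_xyz.size() / 3;

	meshopt_spatialSortTriangles(indices.data(), indices.data(), indices.size(),
	    positions_xyz.data(), vertex_count, sizeof(float) * 3);
}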

+ 296 - 0
include/meshoptimizer/stripifier.cpp

@@ -0,0 +1,296 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+// This work is based on:
+// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
+namespace meshopt
+{
+
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
+{
+	unsigned int index = 0;
+	unsigned int iv = ~0u;
+
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+		unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
+
+		if (v < iv)
+		{
+			index = unsigned(i);
+			iv = v;
+		}
+	}
+
+	return index;
+}
+
+static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
+{
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+		if (e0 == a && e1 == b)
+			return (int(i) << 2) | 2;
+		else if (e0 == b && e1 == c)
+			return (int(i) << 2) | 0;
+		else if (e0 == c && e1 == a)
+			return (int(i) << 2) | 1;
+	}
+
+	return -1;
+}
+
+} // namespace meshopt
+
+size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
+{
+	assert(destination != indices);
+	assert(index_count % 3 == 0);
+
+	using namespace meshopt;
+
+	meshopt_Allocator allocator;
+
+	const size_t buffer_capacity = 8;
+
+	unsigned int buffer[buffer_capacity][3] = {};
+	unsigned int buffer_size = 0;
+
+	size_t index_offset = 0;
+
+	unsigned int strip[2] = {};
+	unsigned int parity = 0;
+
+	size_t strip_size = 0;
+
+	// compute vertex valence; this is used to prioritize starting triangle for strips
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	int next = -1;
+
+	while (buffer_size > 0 || index_offset < index_count)
+	{
+		assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
+
+		// fill triangle buffer
+		while (buffer_size < buffer_capacity && index_offset < index_count)
+		{
+			buffer[buffer_size][0] = indices[index_offset + 0];
+			buffer[buffer_size][1] = indices[index_offset + 1];
+			buffer[buffer_size][2] = indices[index_offset + 2];
+
+			buffer_size++;
+			index_offset += 3;
+		}
+
+		assert(buffer_size > 0);
+
+		if (next >= 0)
+		{
+			unsigned int i = next >> 2;
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+			unsigned int v = buffer[i][next & 3];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// find next triangle (note that edge order flips on every iteration)
+			// in some cases we need to perform a swap to pick a different outgoing triangle edge
+			// for [a b c], the default strip edge is [b c], but we might want to use [a c]
+			int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
+			int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
+
+			if (cont < 0 && swap >= 0)
+			{
+				// [a b c] => [a b a c]
+				destination[strip_size++] = strip[0];
+				destination[strip_size++] = v;
+
+				// next strip has same winding
+				// ? a b => b a v
+				strip[1] = v;
+
+				next = swap;
+			}
+			else
+			{
+				// emit the next vertex in the strip
+				destination[strip_size++] = v;
+
+				// next strip has flipped winding
+				strip[0] = strip[1];
+				strip[1] = v;
+				parity ^= 1;
+
+				next = cont;
+			}
+		}
+		else
+		{
+			// if we didn't find anything, we need to find the next new triangle
+			// we use a heuristic to maximize the strip length
+			unsigned int i = findStripFirst(buffer, buffer_size, valence);
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
+			int ea = findStripNext(buffer, buffer_size, c, b);
+			int eb = findStripNext(buffer, buffer_size, a, c);
+			int ec = findStripNext(buffer, buffer_size, b, a);
+
+			// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
+			// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
+			// reasons - slightly improves the stripification efficiency
+			int mine = INT_MAX;
+			mine = (ea >= 0 && mine > ea) ? ea : mine;
+			mine = (eb >= 0 && mine > eb) ? eb : mine;
+			mine = (ec >= 0 && mine > ec) ? ec : mine;
+
+			if (ea == mine)
+			{
+				// keep abc
+				next = ea;
+			}
+			else if (eb == mine)
+			{
+				// abc -> bca
+				unsigned int t = a;
+				a = b, b = c, c = t;
+
+				next = eb;
+			}
+			else if (ec == mine)
+			{
+				// abc -> cab
+				unsigned int t = c;
+				c = b, b = a, a = t;
+
+				next = ec;
+			}
+
+			if (restart_index)
+			{
+				if (strip_size)
+					destination[strip_size++] = restart_index;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = b;
+				destination[strip_size++] = c;
+
+				// new strip always starts with the same edge winding
+				strip[0] = b;
+				strip[1] = c;
+				parity = 1;
+			}
+			else
+			{
+				if (strip_size)
+				{
+					// connect last strip using degenerate triangles
+					destination[strip_size++] = strip[1];
+					destination[strip_size++] = a;
+				}
+
+				// note that we may need to flip the emitted triangle based on parity
+				// we always end up with outgoing edge "cb" in the end
+				unsigned int e0 = parity ? c : b;
+				unsigned int e1 = parity ? b : c;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = e0;
+				destination[strip_size++] = e1;
+
+				strip[0] = e0;
+				strip[1] = e1;
+				parity ^= 1;
+			}
+		}
+	}
+
+	return strip_size;
+}
+
+size_t meshopt_stripifyBound(size_t index_count)
+{
+	assert(index_count % 3 == 0);
+
+	// worst case without restarts is 2 degenerate indices and 3 indices per triangle
+	// worst case with restarts is 1 restart index and 3 indices per triangle
+	return (index_count / 3) * 5;
+}
+
+size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
+{
+	assert(destination != indices);
+
+	size_t offset = 0;
+	size_t start = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		if (restart_index && indices[i] == restart_index)
+		{
+			start = i + 1;
+		}
+		else if (i - start >= 2)
+		{
+			unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
+
+			// flip winding for odd triangles
+			if ((i - start) & 1)
+			{
+				unsigned int t = a;
+				a = b, b = t;
+			}
+
+			// even when restart indices are used, strip swaps still produce degenerate triangles, so skip them
+			if (a != b && a != c && b != c)
+			{
+				destination[offset + 0] = a;
+				destination[offset + 1] = b;
+				destination[offset + 2] = c;
+				offset += 3;
+			}
+		}
+	}
+
+	return offset;
+}
+
+size_t meshopt_unstripifyBound(size_t index_count)
+{
+	assert(index_count == 0 || index_count >= 3);
+
+	return (index_count == 0) ? 0 : (index_count - 2) * 3;
+}
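
An illustrative round-trip sketch (not part of this commit) for the stripifier above; ~0u is used as the restart index, while passing 0 makes meshopt_stripify join strips with degenerate triangles instead. Running meshopt_optimizeVertexCacheStrip (added below) on the input first typically yields longer strips.

#include <vector>

#include "meshoptimizer.h"

std::vector<unsigned int> buildStrip(const std::vector<unsigned int>& indices, size_t vertex_count)
{
	const unsigned int restart_index = ~0u;

	std::vector<unsigned int> strip(meshopt_stripifyBound(indices.size()));
	strip.resize(meshopt_stripify(strip.data(), indices.data(), indices.size(), vertex_count, restart_index));

	// converting back yields a plain triangle list with degenerate triangles filtered out
	std::vector<unsigned int> list(meshopt_unstripifyBound(strip.size()));
	list.resize(meshopt_unstripify(list.data(), strip.data(), strip.size(), restart_index));

	return strip;
}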

+ 73 - 0
include/meshoptimizer/vcacheanalyzer.cpp

@@ -0,0 +1,73 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
+{
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+	assert(warp_size == 0 || warp_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexCacheStatistics result = {};
+
+	unsigned int warp_offset = 0;
+	unsigned int primgroup_offset = 0;
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		bool ac = (timestamp - cache_timestamps[a]) > cache_size;
+		bool bc = (timestamp - cache_timestamps[b]) > cache_size;
+		bool cc = (timestamp - cache_timestamps[c]) > cache_size;
+
+		// flush cache if triangle doesn't fit into warp or into the primitive buffer
+		if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
+		{
+			result.warps_executed += warp_offset > 0;
+
+			warp_offset = 0;
+			primgroup_offset = 0;
+
+			// reset cache
+			timestamp += cache_size + 1;
+		}
+
+		// update cache and add vertices to warp
+		for (int j = 0; j < 3; ++j)
+		{
+			unsigned int index = indices[i + j];
+
+			if (timestamp - cache_timestamps[index] > cache_size)
+			{
+				cache_timestamps[index] = timestamp++;
+				result.vertices_transformed++;
+				warp_offset++;
+			}
+		}
+
+		primgroup_offset++;
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += cache_timestamps[i] > 0;
+
+	result.warps_executed += warp_offset > 0;
+
+	result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
+	result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);
+
+	return result;
+}
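
An illustrative sketch (not part of this commit) of reading the returned statistics; the 32-entry cache and zero warp/primgroup limits are arbitrary model parameters.

#include <stdio.h>

#include "meshoptimizer.h"

void printCacheEfficiency(const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_VertexCacheStatistics stats = meshopt_analyzeVertexCache(indices, index_count, vertex_count,
	    /* cache_size= */ 32, /* warp_size= */ 0, /* primgroup_size= */ 0);

	// acmr = transformed vertices per triangle (3 is the worst case), atvr = per unique vertex (1 is the best case)
	printf("ACMR %.3f ATVR %.3f (%u vertices transformed, %u warps)\n",
	    stats.acmr, stats.atvr, stats.vertices_transformed, stats.warps_executed);
}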

+ 467 - 0
include/meshoptimizer/vcacheoptimizer.cpp

@@ -0,0 +1,467 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+const size_t kCacheSizeMax = 16;
+const size_t kValenceMax = 8;
+
+struct VertexScoreTable
+{
+	float cache[1 + kCacheSizeMax];
+	float live[1 + kValenceMax];
+};
+
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
+    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
+    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
+};
+
+struct TriangleAdjacency
+{
+	unsigned int* counts;
+	unsigned int* offsets;
+	unsigned int* data;
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
+static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
+{
+	// check dead-end stack
+	while (dead_end_top)
+	{
+		unsigned int vertex = dead_end[--dead_end_top];
+
+		if (live_triangles[vertex] > 0)
+			return vertex;
+	}
+
+	// input order
+	while (input_cursor < vertex_count)
+	{
+		if (live_triangles[input_cursor] > 0)
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u;
+}
+
+static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+{
+	unsigned int best_candidate = ~0u;
+	int best_priority = -1;
+
+	for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
+	{
+		unsigned int vertex = *next_candidate;
+
+		// only consider vertices that still have live triangles; a vertex with none left has nothing to emit
+		if (live_triangles[vertex] > 0)
+		{
+			int priority = 0;
+
+			// will it be in cache after fanning?
+			if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
+			{
+				priority = timestamp - cache_timestamps[vertex]; // position in cache
+			}
+
+			if (priority > best_priority)
+			{
+				best_candidate = vertex;
+				best_priority = priority;
+			}
+		}
+	}
+
+	return best_candidate;
+}
+
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
+{
+	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
+
+	unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
+
+	return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
+}
+
+static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
+{
+	// input order
+	while (input_cursor < face_count)
+	{
+		if (!emitted_flags[input_cursor])
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	unsigned int cache_size = 16;
+	assert(cache_size <= kCacheSizeMax);
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+	unsigned int* live_triangles = adjacency.counts;
+
+	// emitted flags
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// compute initial vertex scores
+	float* vertex_scores = allocator.allocate<float>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
+
+	// compute triangle scores
+	float* triangle_scores = allocator.allocate<float>(face_count);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0];
+		unsigned int b = indices[i * 3 + 1];
+		unsigned int c = indices[i * 3 + 2];
+
+		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
+	}
+
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
+	unsigned int* cache = cache_holder;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
+	size_t cache_count = 0;
+
+	unsigned int current_triangle = 0;
+	unsigned int input_cursor = 1;
+
+	unsigned int output_triangle = 0;
+
+	while (current_triangle != ~0u)
+	{
+		assert(output_triangle < face_count);
+
+		unsigned int a = indices[current_triangle * 3 + 0];
+		unsigned int b = indices[current_triangle * 3 + 1];
+		unsigned int c = indices[current_triangle * 3 + 2];
+
+		// output indices
+		destination[output_triangle * 3 + 0] = a;
+		destination[output_triangle * 3 + 1] = b;
+		destination[output_triangle * 3 + 2] = c;
+		output_triangle++;
+
+		// update emitted flags
+		emitted_flags[current_triangle] = true;
+		triangle_scores[current_triangle] = 0;
+
+		// new triangle
+		size_t cache_write = 0;
+		cache_new[cache_write++] = a;
+		cache_new[cache_write++] = b;
+		cache_new[cache_write++] = c;
+
+		// old triangles
+		for (size_t i = 0; i < cache_count; ++i)
+		{
+			unsigned int index = cache[i];
+
+			cache_new[cache_write] = index;
+			cache_write += (index != a) & (index != b) & (index != c);
+		}
+
+		unsigned int* cache_temp = cache;
+		cache = cache_new, cache_new = cache_temp;
+		cache_count = cache_write > cache_size ? cache_size : cache_write;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		// live triangle counts are updated as a byproduct of these adjustments
+		for (size_t k = 0; k < 3; ++k)
+		{
+			unsigned int index = indices[current_triangle * 3 + k];
+
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbors_size; ++i)
+			{
+				unsigned int tri = neighbors[i];
+
+				if (tri == current_triangle)
+				{
+					neighbors[i] = neighbors[neighbors_size - 1];
+					adjacency.counts[index]--;
+					break;
+				}
+			}
+		}
+
+		unsigned int best_triangle = ~0u;
+		float best_score = 0;
+
+		// update cache positions, vertex scores and triangle scores, and find next best triangle
+		for (size_t i = 0; i < cache_write; ++i)
+		{
+			unsigned int index = cache[i];
+
+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
+			int cache_position = i >= cache_size ? -1 : int(i);
+
+			// update vertex score
+			float score = vertexScore(table, cache_position, live_triangles[index]);
+			float score_diff = score - vertex_scores[index];
+
+			vertex_scores[index] = score;
+
+			// update scores of vertex triangles
+			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
+
+			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
+			{
+				unsigned int tri = *it;
+				assert(!emitted_flags[tri]);
+
+				float tri_score = triangle_scores[tri] + score_diff;
+				assert(tri_score > 0);
+
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;
+
+				triangle_scores[tri] = tri_score;
+			}
+		}
+
+		// step through input triangles in order if we hit a dead-end
+		current_triangle = best_triangle;
+
+		if (current_triangle == ~0u)
+		{
+			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
+		}
+	}
+
+	assert(input_cursor == face_count);
+	assert(output_triangle == face_count);
+}
+
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
+}
+
+void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// cache time stamps
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	// dead-end stack
+	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
+	unsigned int dead_end_top = 0;
+
+	// emitted flags
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	unsigned int current_vertex = 0;
+
+	unsigned int timestamp = cache_size + 1;
+	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
+
+	unsigned int output_triangle = 0;
+
+	while (current_vertex != ~0u)
+	{
+		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
+
+		// emit all vertex neighbors
+		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
+
+		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
+		{
+			unsigned int triangle = *it;
+
+			if (!emitted_flags[triangle])
+			{
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+				// output indices
+				destination[output_triangle * 3 + 0] = a;
+				destination[output_triangle * 3 + 1] = b;
+				destination[output_triangle * 3 + 2] = c;
+				output_triangle++;
+
+				// update dead-end stack
+				dead_end[dead_end_top + 0] = a;
+				dead_end[dead_end_top + 1] = b;
+				dead_end[dead_end_top + 2] = c;
+				dead_end_top += 3;
+
+				// update live triangle counts
+				live_triangles[a]--;
+				live_triangles[b]--;
+				live_triangles[c]--;
+
+				// update cache info
+				// if vertex is not in cache, put it in cache
+				if (timestamp - cache_timestamps[a] > cache_size)
+					cache_timestamps[a] = timestamp++;
+
+				if (timestamp - cache_timestamps[b] > cache_size)
+					cache_timestamps[b] = timestamp++;
+
+				if (timestamp - cache_timestamps[c] > cache_size)
+					cache_timestamps[c] = timestamp++;
+
+				// update emitted flags
+				emitted_flags[triangle] = true;
+			}
+		}
+
+		// next candidates are the ones we pushed to dead-end stack just now
+		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
+
+		// get next vertex
+		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+
+		if (current_vertex == ~0u)
+		{
+			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
+		}
+	}
+
+	assert(output_triangle == face_count);
+}
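
An illustrative before/after sketch (not part of this commit) tying the optimizer to the analyzer added above; meshopt_optimizeVertexCache explicitly supports in-place use, meshopt_optimizeVertexCacheStrip takes the same arguments, and the Fifo variant adds an explicit cache size.

#include <vector>

#include "meshoptimizer.h"

void optimizeIndicesForCache(std::vector<unsigned int>& indices, size_t vertex_count)
{
	meshopt_VertexCacheStatistics before = meshopt_analyzeVertexCache(indices.data(), indices.size(), vertex_count, 32, 0, 0);

	// reorders triangles to maximize vertex reuse; destination may alias indices
	meshopt_optimizeVertexCache(indices.data(), indices.data(), indices.size(), vertex_count);

	meshopt_VertexCacheStatistics after = meshopt_analyzeVertexCache(indices.data(), indices.size(), vertex_count, 32, 0, 0);

	(void)before;
	(void)after; // after.acmr should not exceed before.acmr on typical meshes
}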

+ 1329 - 0
include/meshoptimizer/vertexcodec.cpp

@@ -0,0 +1,1329 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
+#if defined(__AVX__) || defined(__SSSE3__)
+#define SIMD_SSE
+#endif
+
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#define SIMD_TARGET __attribute__((target("ssse3")))
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+// Prevent compiling other variants when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because the optimization needs 64-bit math, which would also hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <tmmintrin.h>
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
+#endif
+
+namespace meshopt
+{
+
+const unsigned char kVertexHeader = 0xa0;
+
+static int gEncodeVertexVersion = 0;
+
+const size_t kVertexBlockSizeBytes = 8192;
+const size_t kVertexBlockMaxSize = 256;
+const size_t kByteGroupSize = 16;
+const size_t kByteGroupDecodeLimit = 24;
+const size_t kTailMaxSize = 32;
+
+static size_t getVertexBlockSize(size_t vertex_size)
+{
+	// make sure the entire block fits into the scratch buffer
+	size_t result = kVertexBlockSizeBytes / vertex_size;
+
+	// align to byte group size; we encode each byte as a byte group
+	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
+	result &= ~(kByteGroupSize - 1);
+
+	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
+}
+
+inline unsigned char zigzag8(unsigned char v)
+{
+	return ((signed char)(v) >> 7) ^ (v << 1);
+}
+
+inline unsigned char unzigzag8(unsigned char v)
+{
+	return -(v & 1) ^ (v >> 1);
+}
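+
+// note: zigzag8 maps byte deltas so that small signed differences become small unsigned values
+// (0, -1, +1, -2, +2, ... map to 0, 1, 2, 3, 4, ...), letting most byte groups below encode with
+// fewer bits per byte; unzigzag8 is the exact inverse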
+
+#if TRACE
+struct Stats
+{
+	size_t size;
+	size_t header;  // bytes for header
+	size_t bitg[4]; // bytes for bit groups
+	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+};
+
+static Stats* bytestats = NULL;
+static Stats vertexstats[256];
+#endif
+
+static bool encodeBytesGroupZero(const unsigned char* buffer)
+{
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		if (buffer[i])
+			return false;
+
+	return true;
+}
+
+static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+
+	if (bits == 8)
+		return kByteGroupSize;
+
+	size_t result = kByteGroupSize * bits / 8;
+
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		result += buffer[i] >= sentinel;
+
+	return result;
+}
+
+static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return data;
+
+	if (bits == 8)
+	{
+		memcpy(data, buffer, kByteGroupSize);
+		return data + kByteGroupSize;
+	}
+
+	size_t byte_size = 8 / bits;
+	assert(kByteGroupSize % byte_size == 0);
+
+	// fixed portion: bits bits for each value
+	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
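+	// (e.g. for bits == 2, the 16 values pack into 4 bytes and every value >= 3 costs one extra byte)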
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
+	{
+		unsigned char byte = 0;
+
+		for (size_t k = 0; k < byte_size; ++k)
+		{
+			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
+
+			byte <<= bits;
+			byte |= enc;
+		}
+
+		*data++ = byte;
+	}
+
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+	{
+		if (buffer[i] >= sentinel)
+		{
+			*data++ = buffer[i];
+		}
+	}
+
+	return data;
+}
+
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+
+	unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return NULL;
+
+	data += header_size;
+
+	memset(header, 0, header_size);
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
+
+		int best_bits = 8;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+
+		for (int bits = 1; bits < 8; bits *= 2)
+		{
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+
+			if (size < best_size)
+			{
+				best_bits = bits;
+				best_size = size;
+			}
+		}
+
+		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
+		assert((1 << bitslog2) == best_bits);
+
+		size_t header_offset = i / kByteGroupSize;
+
+		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
+
+		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
+
+		assert(data + best_size == next);
+		data = next;
+
+#if TRACE
+		bytestats->bitg[bitslog2] += best_size;
+#endif
+	}
+
+#if TRACE
+	bytestats->header += header_size;
+#endif
+
+	return data;
+}
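+
+// Header layout recap with a worked example: each group gets a 2-bit selector
+// (0 = all-zero, 1 = 2-bit, 2 = 4-bit, 3 = raw), packed four per header byte with
+// the least significant pair first. A 64-byte buffer is 4 groups = 1 header byte;
+// selectors {0, 1, 2, 3} pack to 0xe4, and the group payloads follow in order
+// (0 bytes, 4 + escapes, 8 + escapes, 16 bytes respectively).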
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	assert(sizeof(buffer) % kByteGroupSize == 0);
+
+	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+	memset(buffer, 0, sizeof(buffer));
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k];
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+
+			p = vertex_data[vertex_offset];
+
+			vertex_offset += vertex_size;
+		}
+
+#if TRACE
+		const unsigned char* olddata = data;
+		bytestats = &vertexstats[k];
+
+		for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize)
+		{
+			unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k];
+			unsigned char delta = 0xff;
+
+			for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i)
+				delta &= ~(vertex_data[vertex_size * i + k] ^ last);
+
+			for (int j = 0; j < 8; ++j)
+				bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1);
+		}
+#endif
+
+		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
+		if (!data)
+			return NULL;
+
+#if TRACE
+		bytestats = NULL;
+		vertexstats[k].size += data - olddata;
+#endif
+	}
+
+	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
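+
+// Flow recap for the block encoder above: vertices are processed one byte lane at a
+// time (byte k of every vertex in the block); each lane is delta-encoded against the
+// previous vertex, zigzag-mapped, padded up to a multiple of kByteGroupSize and
+// handed to encodeBytes, while last_vertex carries the predictor across blocks.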
+
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+#define READ() byte = *data++
+#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
+
+	unsigned char byte, enc, encv;
+	const unsigned char* data_var;
+
+	switch (bitslog2)
+	{
+	case 0:
+		memset(buffer, 0, kByteGroupSize);
+		return data;
+	case 1:
+		data_var = data + 4;
+
+		// 4 packed bytes, each holding 4 2-bit values
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+
+		return data_var;
+	case 2:
+		data_var = data + 8;
+
+		// 8 packed bytes, each holding 2 4-bit values
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+
+		return data_var;
+	case 3:
+		memcpy(buffer, data, kByteGroupSize);
+		return data + kByteGroupSize;
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+
+#undef READ
+#undef NEXT
+}
+
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+
+	const unsigned char* header = data;
+
+	// round the number of groups up to a multiple of 4 to get the number of header bytes (each header byte stores four 2-bit selectors)
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return NULL;
+
+	data += header_size;
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+		data = decodeBytesGroup(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+	assert(vertex_count <= vertex_count_aligned);
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
+		if (!data)
+			return NULL;
+
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k];
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			unsigned char v = unzigzag8(buffer[i]) + p;
+
+			transposed[vertex_offset] = v;
+			p = v;
+
+			vertex_offset += vertex_size;
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+static unsigned char kDecodeBytesGroupShuffle[256][8];
+static unsigned char kDecodeBytesGroupCount[256];
+
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
+{
+	for (int mask = 0; mask < 256; ++mask)
+	{
+		unsigned char shuffle[8];
+		unsigned char count = 0;
+
+		for (int i = 0; i < 8; ++i)
+		{
+			int maski = (mask >> i) & 1;
+			shuffle[i] = maski ? count : 0x80;
+			count += (unsigned char)(maski);
+		}
+
+		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
+		kDecodeBytesGroupCount[mask] = count;
+	}
+
+	return true;
+}
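+
+// Table shape example (derived from the loop above): for mask 0b00000101 the shuffle
+// is {0, 0x80, 1, 0x80, 0x80, ...} and the count is 2 - lanes whose mask bit is set
+// pick consecutive escape bytes, while 0x80 makes pshufb/tbl/swizzle produce zero
+// for the remaining lanes.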
+
+static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
+#endif
+
+#ifdef SIMD_SSE
+SIMD_TARGET
+static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
+	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
+	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
+
+	__m128i sm1r = _mm_add_epi8(sm1, sm1off);
+
+	return _mm_unpacklo_epi64(sm0, sm1r);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	{
+#ifdef __GNUC__
+		typedef int __attribute__((aligned(1))) unaligned_int;
+#else
+		typedef int unaligned_int;
+#endif
+
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
+		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
+
+		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
+		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
+		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
+	}
+
+	case 2:
+	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
+		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
+
+		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
+		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+    _mm_set1_epi8(3),
+    _mm_set1_epi8(15),
+    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
+    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2);
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return skip + _mm_popcnt_u32(mask16);
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_NEON
+static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+{
+	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
+	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
+
+	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
+	uint8x8_t r1 = vtbl1_u8(rest1, sm1);
+
+	return vcombine_u8(r0, r1);
+}
+
+static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
+
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
+
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
+}
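+
+// Worked example of the movemask trick above: every byte of a 64-bit half is 0x00 or
+// 0xff, and multiplying by the magic constant gathers one bit per byte into the top
+// byte; e.g. lanes 0 and 7 set (0xff000000000000ff) yields mask0 == 0x81.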
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		uint8x16_t result = vdupq_n_u8(0);
+
+		vst1q_u8(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
+		uint8x8_t sel2 = vld1_u8(data);
+		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
+		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
+		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 4);
+		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+		vst1q_u8(buffer, result);
+
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
+	}
+
+	case 2:
+	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
+		uint8x8_t sel4 = vld1_u8(data);
+		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
+		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 8);
+		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+		vst1q_u8(buffer, result);
+
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
+	}
+
+	case 3:
+	{
+		uint8x16_t result = vld1q_u8(data);
+
+		vst1q_u8(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
+	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
+
+	return wasmx_unpacklo_v64x2(sm0, sm1r);
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
+
+	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
+	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0);
+
+		wasm_v128_store(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		v128_t sel2 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 4);
+
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		v128_t sel4 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 8);
+
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
+
+	x0 = _mm_unpacklo_epi16(t0, t2);
+	x1 = _mm_unpackhi_epi16(t0, t2);
+	x2 = _mm_unpacklo_epi16(t1, t3);
+	x3 = _mm_unpackhi_epi16(t1, t3);
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{
+	uint8x16x2_t t01 = vzipq_u8(x0, x1);
+	uint8x16x2_t t23 = vzipq_u8(x2, x3);
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]);
+	x1 = vreinterpretq_u8_u16(x01.val[1]);
+	x2 = vreinterpretq_u8_u16(x23.val[0]);
+	x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
+	uint8x16_t xr = vshrq_n_u8(v, 1);
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2);
+	x1 = wasmx_unpackhi_v16x8(t0, t2);
+	x2 = wasmx_unpacklo_v16x8(t1, t3);
+	x3 = wasmx_unpackhi_v16x8(t1, t3);
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
+	v128_t xr = wasm_u8x16_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+	assert(kByteGroupSize == 16);
+
+	const unsigned char* header = data;
+
+	// round the number of groups up to a multiple of 4 to get the number of header bytes (each header byte stores four 2-bit selectors)
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return NULL;
+
+	data += header_size;
+
+	size_t i = 0;
+
+	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
+	{
+		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4];
+
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+	}
+
+	// slow-path: process remaining groups
+	for (; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return NULL;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+SIMD_TARGET
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize * 4];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+	for (size_t k = 0; k < vertex_size; k += 4)
+	{
+		for (size_t j = 0; j < 4; ++j)
+		{
+			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
+			if (!data)
+				return NULL;
+		}
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#endif
+
+		PREP();
+
+		unsigned char* savep = transposed + k;
+
+		for (size_t j = 0; j < vertex_count_aligned; j += 16)
+		{
+			LOAD(0);
+			LOAD(1);
+			LOAD(2);
+			LOAD(3);
+
+			r0 = unzigzag8(r0);
+			r1 = unzigzag8(r1);
+			r2 = unzigzag8(r2);
+			r3 = unzigzag8(r3);
+
+			transpose8(r0, r1, r2, r3);
+
+			TEMP t0, t1, t2, t3;
+
+			GRP4(0);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(1);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(2);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(3);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{
+	int cpuinfo[4] = {};
+#ifdef _MSC_VER
+	__cpuid(cpuinfo, 1);
+#else
+	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+#endif
+	return cpuinfo[2];
+}
+
+static unsigned int cpuid = getCpuFeatures();
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
+	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
+
+	unsigned char* data = buffer;
+	unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return 0;
+
+	int version = gEncodeVertexVersion;
+
+	*data++ = (unsigned char)(kVertexHeader | version);
+
+	unsigned char first_vertex[256] = {};
+	if (vertex_count > 0)
+		memcpy(first_vertex, vertex_data, vertex_size);
+
+	unsigned char last_vertex[256] = {};
+	memcpy(last_vertex, first_vertex, vertex_size);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return 0;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	if (size_t(data_end - data) < tail_size)
+		return 0;
+
+	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in the decoder
+	if (vertex_size < kTailMaxSize)
+	{
+		memset(data, 0, kTailMaxSize - vertex_size);
+		data += kTailMaxSize - vertex_size;
+	}
+
+	memcpy(data, first_vertex, vertex_size);
+	data += vertex_size;
+
+	assert(data >= buffer + tail_size);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+
+		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
+		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
+		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+
+		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+		printf("\n");
+	}
+#endif
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
+
+	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
+	size_t vertex_block_data_size = vertex_block_size;
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+}
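+
+// Bound arithmetic example (illustrative numbers only): for vertex_size = 16 the
+// block size is 256 vertices, so 1000 vertices use 4 blocks; each block spends
+// (256/16 + 3)/4 = 4 header bytes plus 256 data bytes per vertex byte, giving
+// 1 + 4 * 16 * (4 + 256) + 32 = 16673 bytes as the worst-case estimate.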
+
+void meshopt_encodeVertexVersion(int version)
+{
+	assert(unsigned(version) <= 0);
+
+	meshopt::gEncodeVertexVersion = version;
+}
+
+int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decode = decodeVertexBlockSimd;
+#else
+	decode = decodeVertexBlock;
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	assert(gDecodeBytesGroupInitialized);
+	(void)gDecodeBytesGroupInitialized;
+#endif
+
+	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
+
+	const unsigned char* data = buffer;
+	const unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return -2;
+
+	unsigned char data_header = *data++;
+
+	if ((data_header & 0xf0) != kVertexHeader)
+		return -1;
+
+	int version = data_header & 0x0f;
+	if (version > 0)
+		return -1;
+
+	unsigned char last_vertex[256];
+	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return -2;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	if (size_t(data_end - data) != tail_size)
+		return -3;
+
+	return 0;
+}
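+
+// Minimal round-trip sketch (illustrative only; Vertex, vertices and count are
+// placeholders, and vertex_size must be a multiple of 4 that is <= 256):
+//   std::vector<unsigned char> buf(meshopt_encodeVertexBufferBound(count, sizeof(Vertex)));
+//   buf.resize(meshopt_encodeVertexBuffer(buf.data(), buf.size(), vertices, count, sizeof(Vertex)));
+//   ... store or transmit buf ...
+//   std::vector<Vertex> decoded(count);
+//   int rc = meshopt_decodeVertexBuffer(decoded.data(), count, sizeof(Vertex), buf.data(), buf.size());
+//   assert(rc == 0); // negative return values indicate unsupported or malformed input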
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
+#undef SIMD_LATENCYOPT

+ 1046 - 0
include/meshoptimizer/vertexfilter.cpp

@@ -0,0 +1,1046 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#undef __DEPRECATED
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{
+	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+		// fixup octahedral coordinates for z<0
+		float t = (z >= 0.f) ? 0.f : z;
+
+		x += (x >= 0.f) ? t : -t;
+		y += (y >= 0.f) ? t : -t;
+
+		// compute normal length & scale
+		float l = sqrtf(x * x + y * y + z * z);
+		float s = max / l;
+
+		// rounded signed float->int
+		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+		data[i * 4 + 0] = T(xf);
+		data[i * 4 + 1] = T(yf);
+		data[i * 4 + 2] = T(zf);
+	}
+}
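+
+// Decode example for the octahedral filter (signed char case): an input of
+// (0, 0, 127, w) reconstructs z = 127 - |0| - |0| = 127 and renormalizes to the unit
+// +Z axis (0, 0, 127); (127, 0, 127, w) reconstructs z = 0 and yields +X. The w
+// component is passed through untouched.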
+
+static void decodeFilterQuat(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from the high byte of the component
+		int sf = data[i * 4 + 3] | 3;
+		float ss = scale / float(sf);
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float x = float(data[i * 4 + 0]) * ss;
+		float y = float(data[i * 4 + 1]) * ss;
+		float z = float(data[i * 4 + 2]) * ss;
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float ww = 1.f - x * x - y * y - z * z;
+		float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+		// rounded signed float->int
+		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * 32767.f + 0.5f);
+
+		int qc = data[i * 4 + 3] & 3;
+
+		// output order is dictated by input index
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
+	}
+}
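+
+// Note on the quaternion layout decoded above: the fourth input value does double
+// duty - its low 2 bits (qc) select which output slot receives the reconstructed
+// component w, and with those bits forced to 1 (sf) it serves as the quantization
+// range, so the three stored components decode to at most 1/sqrt(2) in magnitude.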
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa and exponent
+		int m = int(v << 8) >> 8;
+		int e = int(v) >> 24;
+
+		union
+		{
+			float f;
+			unsigned int ui;
+		} u;
+
+		// optimized version of ldexp(float(m), e)
+		u.ui = unsigned(e + 127) << 23;
+		u.f = u.f * float(m);
+
+		data[i] = u.ui;
+	}
+}
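+
+// Worked example for the exponent filter: a word storing e = -2 in the top byte and
+// m = 3 in the low 24 bits decodes to 3 * 2^-2 = 0.75f, written back over the
+// integer payload as an IEEE-754 float.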
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+template <typename T>
+static void dispatchSimd(void (*process)(T*, size_t), T* data, size_t count, size_t stride)
+{
+	assert(stride <= 4);
+
+	size_t count4 = count & ~size_t(3);
+	process(data, count4);
+
+	if (count4 < count)
+	{
+		T tail[4 * 4] = {}; // max stride 4, max count 4
+		size_t tail_size = (count - count4) * stride * sizeof(T);
+		assert(tail_size <= sizeof(tail));
+
+		memcpy(tail, data + count4 * stride, tail_size);
+		process(tail, count - count4);
+		memcpy(data + count4 * stride, tail, tail_size);
+	}
+}
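+
+// dispatchSimd lets the SIMD kernels below assume the count is a multiple of 4: the
+// bulk is processed in place and a tail of up to 3 elements is bounced through a
+// small stack buffer so the kernels never read or write past the caller's data.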
+
+inline uint64_t rotateleft64(uint64_t v, int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63));
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f);
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24);
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+	float32x4_t r = vrsqrteq_f32(x);
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+	return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+	float32x4_t r = vrecpeq_f32(y);
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+	return vmulq_f32(x, r);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate and store
+		uint64_t* out = (uint64_t*)&data[i * 4];
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		int32x4_t ef = vshrq_n_s32(v, 24);
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// combine xr/yr/zr into final value
+		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
+		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+		v128_t yf = wasm_i32x4_shr(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+		v128_t zf = wasm_v128_and(z4, zmask);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+		// patch in .w
+		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// note: i32x4_max with 0 is equivalent to f32x4_max
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+		v128_t s = wasm_f32x4_splat(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+		// compute component index shifted left by 4 (and moved into i32x4 slot)
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly
+		v128_t ef = wasm_i32x4_shr(v, 24);
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m);
+
+		wasm_v128_store(&data[i], r);
+	}
+}
+#endif
+
+// optimized variant of frexp
+inline int optlog2(float v)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.f = v;
+	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
+	return v == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+}
+
+// optimized variant of ldexp
+inline float optexp2(int e)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.ui = unsigned(e + 127) << 23;
+	return u.f;
+}
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (stride == 4)
+		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
+	else
+		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
+#else
+	if (stride == 4)
+		decodeFilterOct(static_cast<signed char*>(buffer), count);
+	else
+		decodeFilterOct(static_cast<short*>(buffer), count);
+#endif
+}
+
+void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 8);
+	(void)stride;
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), count, 4);
+#else
+	decodeFilterQuat(static_cast<short*>(buffer), count);
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int*>(buffer), count * (stride / 4), 1);
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), count * (stride / 4));
+#endif
+}
+
+void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 1 && bits <= 16);
+
+	signed char* d8 = static_cast<signed char*>(destination);
+	short* d16 = static_cast<short*>(destination);
+
+	int bytebits = int(stride * 2);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* n = &data[i * 4];
+
+		// octahedral encoding of a unit vector
+		float nx = n[0], ny = n[1], nz = n[2], nw = n[3];
+		float nl = fabsf(nx) + fabsf(ny) + fabsf(nz);
+		float ns = nl == 0.f ? 0.f : 1.f / nl;
+
+		nx *= ns;
+		ny *= ns;
+
+		float u = (nz >= 0.f) ? nx : (1 - fabsf(ny)) * (nx >= 0.f ? 1.f : -1.f);
+		float v = (nz >= 0.f) ? ny : (1 - fabsf(nx)) * (ny >= 0.f ? 1.f : -1.f);
+
+		int fu = meshopt_quantizeSnorm(u, bits);
+		int fv = meshopt_quantizeSnorm(v, bits);
+		int fo = meshopt_quantizeSnorm(1.f, bits);
+		int fw = meshopt_quantizeSnorm(nw, bytebits);
+
+		if (stride == 4)
+		{
+			d8[i * 4 + 0] = (signed char)(fu);
+			d8[i * 4 + 1] = (signed char)(fv);
+			d8[i * 4 + 2] = (signed char)(fo);
+			d8[i * 4 + 3] = (signed char)(fw);
+		}
+		else
+		{
+			d16[i * 4 + 0] = short(fu);
+			d16[i * 4 + 1] = short(fv);
+			d16[i * 4 + 2] = short(fo);
+			d16[i * 4 + 3] = short(fw);
+		}
+	}
+}
+
+void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 8);
+	assert(bits >= 4 && bits <= 16);
+	(void)stride;
+
+	short* destination = static_cast<short*>(destination_);
+
+	const float scaler = sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* q = &data[i * 4];
+		short* d = &destination[i * 4];
+
+		// establish maximum quaternion component
+		int qc = 0;
+		qc = fabsf(q[1]) > fabsf(q[qc]) ? 1 : qc;
+		qc = fabsf(q[2]) > fabsf(q[qc]) ? 2 : qc;
+		qc = fabsf(q[3]) > fabsf(q[qc]) ? 3 : qc;
+
+		// we use double-cover properties to discard the sign
+		float sign = q[qc] < 0.f ? -1.f : 1.f;
+
+		// note: we always encode a cyclical swizzle to be able to recover the order via rotation
+		d[0] = short(meshopt_quantizeSnorm(q[(qc + 1) & 3] * scaler * sign, bits));
+		d[1] = short(meshopt_quantizeSnorm(q[(qc + 2) & 3] * scaler * sign, bits));
+		d[2] = short(meshopt_quantizeSnorm(q[(qc + 3) & 3] * scaler * sign, bits));
+		d[3] = short((meshopt_quantizeSnorm(1.f, bits) & ~3) | qc);
+	}
+}
+
+void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
+{
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0 && stride <= 256);
+	assert(bits >= 1 && bits <= 24);
+
+	unsigned int* destination = static_cast<unsigned int*>(destination_);
+	size_t stride_float = stride / sizeof(float);
+
+	int component_exp[64];
+	assert(stride_float <= sizeof(component_exp) / sizeof(int));
+
+	const int min_exp = -100;
+
+	if (mode == meshopt_EncodeExpSharedComponent)
+	{
+		for (size_t j = 0; j < stride_float; ++j)
+			component_exp[j] = min_exp;
+
+		for (size_t i = 0; i < count; ++i)
+		{
+			const float* v = &data[i * stride_float];
+
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
+			}
+		}
+	}
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* v = &data[i * stride_float];
+		unsigned int* d = &destination[i * stride_float];
+
+		int vector_exp = min_exp;
+
+		if (mode == meshopt_EncodeExpSharedVector)
+		{
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				vector_exp = (vector_exp < e) ? e : vector_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpSeparate)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (min_exp < e) ? e : min_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpClamped)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (0 < e) ? e : 0;
+			}
+		}
+		else
+		{
+			// the code below assumes component_exp is initialized outside of the loop
+			assert(mode == meshopt_EncodeExpSharedComponent);
+		}
+
+		for (size_t j = 0; j < stride_float; ++j)
+		{
+			int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
+
+			// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
+			exp -= (bits - 1);
+
+			// compute renormalized rounded mantissa for each component
+			int mmask = (1 << 24) - 1;
+			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
+
+			d[j] = (m & mmask) | (unsigned(exp) << 24);
+		}
+	}
+}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM
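
For reference, an illustrative round-trip of the oct filter entry points defined above (not part of the diff; the buffer name, size and bit count are assumptions):

#include <vector>
#include "meshoptimizer.h"

// hedged usage sketch: `normals` is assumed to hold `count` unit vectors, 4 floats each
// (the fourth component is carried through by the filter)
void packNormals(const float* normals, size_t count)
{
	std::vector<signed char> packed(count * 4);

	// quantize to 8-bit octahedral form; stride 4 selects the signed char path
	meshopt_encodeFilterOct(packed.data(), count, 4, 8, normals);

	// decoding runs in place; with SIMD_SSE/NEON/WASM defined it dispatches to the SIMD kernels above
	meshopt_decodeFilterOct(packed.data(), count, 4);
}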

+ 58 - 0
include/meshoptimizer/vfetchanalyzer.cpp

@@ -0,0 +1,58 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64;
+	const size_t kCacheSize = 128 * 1024;
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size;
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine;
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
+
+		assert(start_tag < end_tag);
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+			// we store +1 since cache is filled with 0 by default
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i];
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
+
+	return result;
+}
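
An illustrative way to consume the statistics computed above (not part of the diff; the Vertex layout is an assumption):

#include <stdio.h>
#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // hypothetical interleaved layout

void reportVertexFetch(const std::vector<unsigned int>& indices, size_t vertex_count)
{
	meshopt_VertexFetchStatistics stats =
	    meshopt_analyzeVertexFetch(indices.data(), indices.size(), vertex_count, sizeof(Vertex));

	// overfetch of 1.0 means each unique vertex byte crossed the modeled cache exactly once;
	// larger values indicate scattered fetches under the direct-mapped approximation above
	printf("fetched %u bytes, overfetch %.2f\n", stats.bytes_fetched, stats.overfetch);
}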

+ 74 - 0
include/meshoptimizer/vfetchoptimizer.cpp

@@ -0,0 +1,74 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			destination[index] = next_vertex++;
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place optimization
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	// build vertex remap table
+	unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		unsigned int& remap = vertex_remap[index];
+
+		if (remap == ~0u) // vertex was not added to destination VB
+		{
+			// add vertex
+			memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
+
+			remap = next_vertex++;
+		}
+
+		// modify indices in place
+		indices[i] = remap;
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
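
An illustrative call (not part of the diff) showing the in-place path that the internal copy above enables, typically run as the final optimization step:

#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // hypothetical layout

void reorderForFetch(std::vector<Vertex>& vertices, std::vector<unsigned int>& indices)
{
	// destination == vertices is allowed; the function copies the source internally
	size_t unique = meshopt_optimizeVertexFetch(vertices.data(), indices.data(), indices.size(),
	                                            vertices.data(), vertices.size(), sizeof(Vertex));

	// unreferenced vertices end up past `unique` and can be dropped
	vertices.resize(unique);
}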

+ 24 - 6
libs/fmt/fmt.vcxproj

@@ -27,6 +27,22 @@
     </ProjectConfiguration>
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\include\meshoptimizer\allocator.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\clusterizer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\indexcodec.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\indexgenerator.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\overdrawanalyzer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\overdrawoptimizer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\quantization.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\simplifier.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\spatialorder.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\stripifier.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vcacheanalyzer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vcacheoptimizer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vertexcodec.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vertexfilter.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vfetchanalyzer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\vfetchoptimizer.cpp" />
     <ClCompile Include="..\..\include\mikktspace\mikktspace.c" />
     <ClCompile Include="..\..\include\png\png.c" />
     <ClCompile Include="..\..\include\png\pngerror.c" />
@@ -129,10 +145,12 @@
     <ClCompile Include="..\..\include\zlib\zutil.c" />
     <ClCompile Include="dxt.c" />
     <ClCompile Include="fmt.c" />
+    <ClCompile Include="meshoptimizer.cpp" />
     <ClCompile Include="mikkt.c" />
     <ClCompile Include="sha1.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\include\meshoptimizer\meshoptimizer.h" />
     <ClInclude Include="..\..\include\mikktspace\mikktspace.h" />
     <ClInclude Include="..\..\include\png\png.h" />
     <ClInclude Include="..\..\include\png\pngconf.h" />
@@ -247,37 +265,37 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <LinkIncremental>true</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;../../$(Configuration);../../include/turbojpeg/x86</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64;../../x64/$(Configuration);../../include/turbojpeg/x64</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <LinkIncremental>false</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;../../$(Configuration);../../include/turbojpeg/x86</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|Win32'">
     <LinkIncremental>false</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;../../$(Configuration);../../include/turbojpeg/x86</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64;../../x64/$(Configuration);../../include/turbojpeg/x64</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|x64'">
     <LinkIncremental>false</LinkIncremental>
     <TargetExt>.hdll</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3</IncludePath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);../../src;../../include/turbojpeg;../../include/zlib;../../include/png;../../include/vorbis;../../include/mikktspace;../../include/minimp3;../../include/meshoptimizer</IncludePath>
     <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64;../../x64/$(Configuration);../../include/turbojpeg/x64</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">

+ 55 - 0
libs/fmt/fmt.vcxproj.filters

@@ -305,6 +305,55 @@
       <Filter>mikkt</Filter>
     </ClCompile>
     <ClCompile Include="dxt.c" />
+    <ClCompile Include="meshoptimizer.cpp" />
+    <ClCompile Include="..\..\include\meshoptimizer\allocator.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\clusterizer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\indexcodec.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\indexgenerator.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\overdrawanalyzer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\overdrawoptimizer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\quantization.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\simplifier.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\spatialorder.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\stripifier.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vcacheanalyzer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vcacheoptimizer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vertexcodec.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vertexfilter.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vfetchanalyzer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\include\meshoptimizer\vfetchoptimizer.cpp">
+      <Filter>meshoptimizer</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="turbojpeg">
@@ -325,6 +374,9 @@
     <Filter Include="minimp3">
       <UniqueIdentifier>{1bfe5a7d-6015-497d-936d-1a92e8938ca7}</UniqueIdentifier>
     </Filter>
+    <Filter Include="meshoptimizer">
+      <UniqueIdentifier>{6308414b-7320-4672-b033-ebcea25344d3}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\include\zlib\zconf.h">
@@ -439,5 +491,8 @@
     <ClInclude Include="..\..\include\minimp3\minimp3.h">
       <Filter>minimp3</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\include\meshoptimizer\meshoptimizer.h">
+      <Filter>meshoptimizer</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>

+ 39 - 0
libs/fmt/meshoptimizer.cpp

@@ -0,0 +1,39 @@
+#define HL_NAME(n) fmt_##n
+#include <meshoptimizer.h>
+#include <hl.h>
+
+HL_PRIM int HL_NAME(generate_vertex_remap)(unsigned int* pRemapOut, unsigned int* pIndices, int indexCount, float* pVertices, int vertexCount, int vertexSize) {
+    return meshopt_generateVertexRemap(pRemapOut, pIndices, indexCount, pVertices, vertexCount, vertexSize);
+}
+
+HL_PRIM void HL_NAME(remap_index_buffer)(unsigned int* pIndicesOut, unsigned int* pIndicesIn, int indexCount, unsigned int* pRemap) {
+    meshopt_remapIndexBuffer(pIndicesOut, pIndicesIn, indexCount, pRemap);
+}
+
+HL_PRIM void HL_NAME(remap_vertex_buffer)(void* pVerticesOut, void* pVerticesIn, int vertexCount, int vertexSize, unsigned int* pRemap) {
+    meshopt_remapVertexBuffer(pVerticesOut, pVerticesIn, vertexCount, vertexSize, pRemap);
+}
+
+HL_PRIM int HL_NAME(simplify)(unsigned int* pIndicesOut, unsigned int* pIndicesIn, int indexCount, float* pVertices, int vertexCount, int vertexSize, int targetIndexCount, float targetError, int options, float* resultErrorOut) {
+    return meshopt_simplify(pIndicesOut, pIndicesIn, indexCount, pVertices, vertexCount, vertexSize, targetIndexCount, (float)targetError, options, resultErrorOut);
+}
+
+HL_PRIM void HL_NAME(optimize_vertex_cache)(unsigned int* pIndicesOut, unsigned int* pIndicesIn, int indexCount, int vertexCount) {
+    meshopt_optimizeVertexCache(pIndicesOut, pIndicesIn, indexCount, vertexCount);
+}
+
+HL_PRIM void HL_NAME(optimize_overdraw)(unsigned int* pIndicesOut, unsigned int* pIndicesIn, int indexCount, float* pVertices, int vertexCount, int vertexSize, float threshold) {
+    meshopt_optimizeOverdraw(pIndicesOut, pIndicesIn, indexCount, pVertices, vertexCount, vertexSize, threshold);
+}
+
+HL_PRIM int HL_NAME(optimize_vertex_fetch)(float* pVerticesOut, unsigned int* pIndices, int indexCount, void* pVerticesIn, int vertexCount, int vertexSize) {
+    return meshopt_optimizeVertexFetch(pVerticesOut, pIndices, indexCount, pVerticesIn, vertexCount, vertexSize);
+}
+
+DEFINE_PRIM(_I32, generate_vertex_remap, _BYTES _BYTES _I32 _BYTES _I32 _I32);
+DEFINE_PRIM(_VOID, remap_index_buffer, _BYTES _BYTES _I32 _BYTES);
+DEFINE_PRIM(_VOID, remap_vertex_buffer, _BYTES _BYTES _I32 _I32 _BYTES);
+DEFINE_PRIM(_I32, simplify, _BYTES _BYTES _I32 _BYTES _I32 _I32 _I32 _F32 _I32 _BYTES);
+DEFINE_PRIM(_VOID, optimize_vertex_cache, _BYTES _BYTES _I32 _I32);
+DEFINE_PRIM(_VOID, optimize_overdraw, _BYTES _BYTES _I32 _BYTES _I32 _I32 _F32);
+DEFINE_PRIM(_I32, optimize_vertex_fetch, _BYTES _BYTES _I32 _BYTES _I32 _I32);
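
For context, the primitives above wrap the standard meshoptimizer pipeline; below is a hedged sketch of the equivalent call order against the C API (the Vertex layout and the 1.05 overdraw threshold are illustrative, not part of this commit):

#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // hypothetical layout, position first

void optimizeMesh(std::vector<Vertex>& vertices, std::vector<unsigned int>& indices)
{
	// 1. deduplicate vertices and compact both buffers
	std::vector<unsigned int> remap(vertices.size());
	size_t unique = meshopt_generateVertexRemap(remap.data(), indices.data(), indices.size(),
	                                            vertices.data(), vertices.size(), sizeof(Vertex));

	std::vector<Vertex> new_vertices(unique);
	std::vector<unsigned int> new_indices(indices.size());
	meshopt_remapVertexBuffer(new_vertices.data(), vertices.data(), vertices.size(), sizeof(Vertex), remap.data());
	meshopt_remapIndexBuffer(new_indices.data(), indices.data(), indices.size(), remap.data());

	// 2. reorder triangles for the post-transform cache, then for overdraw (both in place)
	meshopt_optimizeVertexCache(new_indices.data(), new_indices.data(), new_indices.size(), unique);
	meshopt_optimizeOverdraw(new_indices.data(), new_indices.data(), new_indices.size(),
	                         &new_vertices[0].px, unique, sizeof(Vertex), 1.05f);

	// 3. reorder vertices to match the final index order
	meshopt_optimizeVertexFetch(new_vertices.data(), new_indices.data(), new_indices.size(),
	                            new_vertices.data(), unique, sizeof(Vertex));

	vertices.swap(new_vertices);
	indices.swap(new_indices);
}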