@@ -550,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
 	return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+	return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
 #endif
 
 #ifdef SIMD_NEON
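
A note on the vfmaq_f32 shim above: ARMv7 NEON targets without the FMA extension do not provide this intrinsic at all, so the fallback maps it to a separate multiply and add. A true FMA rounds once while the fallback rounds twice, so results may differ in the last ULP; that is acceptable here since these code paths previously used explicit vmulq/vaddq pairs anyway. A minimal standalone sketch (not part of the patch) that makes the one-vs-two-roundings difference observable in scalar code:

#include <cmath>
#include <cstdio>

int main()
{
	// compile with -ffp-contract=off so the compiler does not fuse a * b + c itself
	float a = 1e8f, b = 1e8f, c = -1e16f;
	float fused = fmaf(a, b, c); // a * b kept exact before the add: one rounding
	float split = a * b + c;     // a * b rounded to float first: two roundings
	printf("fused=%g split=%g\n", fused, split); // fused ~= -2.7e8, split == 0
	return 0;
}
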
@@ -580,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
 		float32x4_t rl = vrsqrteq_f32(ll);
 		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// combine xr/yr/zr into final value
-		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+		int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+		res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
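
The fsnap constant above is the classic float-snapping trick and is worth a scalar illustration: adding 3 << 22 (12582912.f, i.e. 1.5 * 2^23) to a float of magnitude below 2^22 forces the sum into [2^23, 2^24), where one mantissa ULP equals 1, so the hardware's round-to-nearest-even leaves the rounded integer sitting in the low mantissa bits with a constant 0x4B40_0000 offset; since only the low 8 bits are kept here, the offset never needs to be subtracted. A standalone sketch of the same trick (illustrative, not from the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	float f = -3.6f;
	float snapped = f + 12582912.f; // 3 << 22 == 1.5 * 2^23
	int32_t bits;
	memcpy(&bits, &snapped, sizeof(bits)); // bits == 0x4B400000 + round(f)
	printf("%d\n", (int16_t)bits); // low 16 bits hold round(f): prints -4
	return 0;
}
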
@@ -634,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 		float32x4_t rl = vrsqrteq_f32(ll);
 		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
 		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+		float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// mix x/z and y/0 to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
 		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
 
 		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
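
The new #if split above encodes a per-target tradeoff: AArch64 has fast full-precision vsqrtq_f32 and vdivq_f32, so 32767 / sqrt(ll) is computed directly, while 32-bit NEON only provides vrsqrteq_f32, an estimate with roughly 8 bits of precision that needs one Newton-Raphson step. vrsqrtsq_f32(a, b) evaluates (3 - a * b) / 2, so the refinement line computes r' = r * (3 - ll * r * r) / 2, the standard Newton iteration for 1/sqrt(ll). A scalar sketch of that step (illustrative only):

#include <cmath>
#include <cstdio>

// one Newton-Raphson step for r ~= 1/sqrt(x): r' = r * (3 - x * r * r) / 2
float rsqrt_refine(float x, float r)
{
	return r * (3.f - x * r * r) * 0.5f;
}

int main()
{
	float x = 2.f;
	float r = 0.7f; // crude initial estimate, standing in for vrsqrteq_f32
	printf("refined=%f exact=%f\n", rsqrt_refine(x, r), 1.f / sqrtf(x)); // ~0.707 vs 0.707107
	return 0;
}
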
@@ -694,7 +703,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
 		float32x4_t ws = vmulq_f32(s, s);
-		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
 		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
 
 		// compute final scale; note that all computations above are unscaled
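
The vmaxq_f32 with 0.f above is load-bearing: the stored x/y/z are quantized, so rounding error can push ww slightly below zero, and a square root of a negative input yields NaN. Clamping first degrades gracefully to w = 0, as this scalar sketch (illustrative) shows:

#include <cmath>
#include <cstdio>

int main()
{
	float ww = -1e-7f; // slightly negative purely due to rounding error
	printf("raw=%f clamped=%f\n", sqrtf(ww), sqrtf(fmaxf(ww, 0.f))); // nan vs 0
	return 0;
}
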
@@ -705,26 +714,32 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap));
-		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+		int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
 
 		// mix x/z and w/y to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+		int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
 
 		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+		uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// store results to stack so that we can rotate using scalar instructions
+		// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+		volatile uint64_t res[4];
+		vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+		vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
 
 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
 
-		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
-		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
-		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
 	}
 }
 
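
The scalar tail above relies on rotateleft64, defined earlier in this file (and assumed here to reduce its rotate amount modulo 64): each decoded quaternion occupies four 16-bit lanes packed as wxyz in one 64-bit word, and the low bits of the stored .w component select which slot the reconstructed maximum component should occupy, so rotating left by index * 16 bits (hence the << 4) restores the required component order. A self-contained sketch; rotl64 is an illustrative stand-in rather than the file's exact helper:

#include <cstdint>
#include <cstdio>

// rotate left by c bits, c in [0, 63] (illustrative stand-in for rotateleft64)
inline uint64_t rotl64(uint64_t x, int c)
{
	return c == 0 ? x : (x << c) | (x >> (64 - c));
}

int main()
{
	uint64_t wxyz = 0x0004000300020001ull; // 16-bit lanes: w=1, x=2, y=3, z=4
	for (int qc = 0; qc < 4; ++qc)
		printf("qc=%d -> %016llx\n", qc, (unsigned long long)rotl64(wxyz, qc << 4));
	return 0;
}
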
@@ -778,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count)
 		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// repack rgba into final value
-		int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
-		res = vorrq_s32(res, vshlq_n_s32(ar, 24));
+		int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
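
The nested vsliq_n_s32 above is the heart of the new repack: per 32-bit lane, vsliq_n_s32(a, b, n) computes (b << n) | (a & ((1 << n) - 1)), i.e. it shifts b left and inserts it over a while preserving a's low n bits. Chained three times it assembles r | g << 8 | b << 16 | a << 24, and each shift conveniently discards the 0x4B40_0000 snap offset still sitting in every channel's upper bits. A scalar model of the composition (the sli helper is illustrative):

#include <cstdint>
#include <cstdio>

// scalar model of one vsliq_n_s32 lane: shift b left by n, keep a's low n bits
uint32_t sli(uint32_t a, uint32_t b, int n)
{
	return (b << n) | (a & ((1u << n) - 1));
}

int main()
{
	// each channel still carries the 0x4B40_0000 snap offset in its high bits
	uint32_t rr = 0x4B400011, gr = 0x4B400022, br = 0x4B400033, ar = 0x4B400044;
	printf("%08x\n", sli(rr, sli(gr, sli(br, ar, 8), 8), 8)); // prints 44332211
	return 0;
}
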
@@ -835,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// mix r/b and g/a to make 16-bit unpack easier
-		int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
-		int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
+		int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+		int32x4_t gar = vsliq_n_s32(gr, ar, 16);
 
 		// pack r/g/b/a using 16-bit unpacks
 		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
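
The mix step above sets up a single-zip finish: rbr holds r | b << 16 and gar holds g | a << 16 for each pixel, so interleaving their 16-bit lanes with vzipq_s16 emits r, g, b, a as four consecutive 16-bit values per pixel. A scalar model of the zip for two pixels (illustrative):

#include <cstdint>
#include <cstdio>

int main()
{
	uint16_t rbr[4] = {0x1111, 0x3333, 0x5555, 0x7777}; // lanes: r0, b0, r1, b1
	uint16_t gar[4] = {0x2222, 0x4444, 0x6666, 0x8888}; // lanes: g0, a0, g1, a1
	uint16_t res[8];
	for (int j = 0; j < 4; ++j) // zip = interleave lanes pairwise
	{
		res[j * 2 + 0] = rbr[j];
		res[j * 2 + 1] = gar[j];
	}
	for (int j = 0; j < 8; ++j)
		printf("%04x ", res[j]); // 1111 2222 3333 4444 ... = r, g, b, a per pixel
	printf("\n");
	return 0;
}
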
@@ -1145,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
 
 		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
|