3 settimane fa · 3dbb193ece
--- a/scene/resources/surface_tool.h
+++ b/scene/resources/surface_tool.h
@@ -89,6 +89,10 @@ public:
 
															 		SIMPLIFY_ERROR_ABSOLUTE = 1 << 2, // From meshopt_SimplifyErrorAbsolute
														
 
															 		/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
														
 
															 		SIMPLIFY_PRUNE = 1 << 3, // From meshopt_SimplifyPrune
														
 
															+		/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
														
 
															+		SIMPLIFY_REGULARIZE = 1 << 4, // From meshopt_SimplifyRegularize
														
 
															+		/* Allow collapses across attribute discontinuities, except for vertices that are tagged with 0x02 in vertex_lock. */
														
 
															+		SIMPLIFY_PERMISSIVE = 1 << 5, // From meshopt_SimplifyPermissive
														
 
															 	};
														
 
															 	typedef void (*OptimizeVertexCacheFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, size_t vertex_count);
														
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -679,7 +679,7 @@ Patches:
 
															 ## meshoptimizer
														
 
															 - Upstream: https://github.com/zeux/meshoptimizer
														
 
															-- Version: 0.24 (7b2d4f4c817aea55d74dcd65d9763ac2ca608026, 2025)
														
 
															+- Version: 0.25 (6daea4695c48338363b08022d2fb15deaef6ac09, 2025)
														
 
															 - License: MIT
														
 
															 Files extracted from upstream repository:
														
--- a/thirdparty/meshoptimizer/allocator.cpp
+++ b/thirdparty/meshoptimizer/allocator.cpp
@@ -1,8 +1,17 @@
 
															 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
														
 
															 #include "meshoptimizer.h"
														
 
															+#ifdef MESHOPTIMIZER_ALLOC_EXPORT // this out-of-line definition is compiled only when the macro is defined
														
 
															+meshopt_Allocator::Storage& meshopt_Allocator::storage() // accessor for the allocate/deallocate callback pair
														
 
															+{
														
 
															+	static Storage s = {::operator new, ::operator delete }; // function-local static; initial callbacks are the global operator new / operator delete
														
 
															+	return s;
														
 
															+}
														
 
															+#endif // MESHOPTIMIZER_ALLOC_EXPORT
														
 
															+
														
 
															 void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
														
 
															 {
														
 
															-	meshopt_Allocator::Storage::allocate = allocate;
														
 
															-	meshopt_Allocator::Storage::deallocate = deallocate;
														
 
															+	meshopt_Allocator::Storage& s = meshopt_Allocator::storage(); // callbacks are reached through the storage() accessor instead of static members
														
 
															+	s.allocate = allocate; // install the caller-supplied allocation callback
														
 
															+	s.deallocate = deallocate; // install the caller-supplied deallocation callback
														
 
															 }
														
--- a/thirdparty/meshoptimizer/indexgenerator.cpp
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -439,6 +439,31 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 
															 	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
														
 
															 }
														
 
															+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) // writes one remap entry per vertex into destination
														
 
															+{
														
 
															+	using namespace meshopt;
														
 
															+
														
 
															+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); // stride is in bytes; 12 bytes is one float3 position
														
 
															+	assert(vertex_positions_stride % sizeof(float) == 0); // required because the stride is converted to a float count below
														
 
															+
														
 
															+	meshopt_Allocator allocator;
														
 
															+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL}; // stride passed in floats; NOTE(review): the two NULL fields are presumably the optional callback/context - confirm against VertexCustomHasher
														
 
															+
														
 
															+	size_t table_size = hashBuckets(vertex_count);
														
 
															+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
														
 
															+	memset(table, -1, table_size * sizeof(unsigned int)); // every byte becomes 0xFF, so each slot reads as ~0u (the empty marker tested below)
														
 
															+
														
 
															+	for (size_t i = 0; i < vertex_count; ++i)
														
 
															+	{
														
 
															+		unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u); // presumably the slot of a vertex with an equal position, or an empty slot - verify against hashLookup
														
 
															+
														
 
															+		if (*entry == ~0u)
														
 
															+			*entry = unsigned(i); // empty slot: vertex i becomes the stored index for this slot
														
 
															+
														
 
															+		destination[i] = *entry; // either i itself or an index stored by an earlier iteration
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															 void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
														
 
															 {
														
 
															 	using namespace meshopt;
														
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * meshoptimizer - version 0.24
														
 
															+ * meshoptimizer - version 0.25
														
 
															  *
														
 
															  * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
														
 
															  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
														
@@ -12,7 +12,7 @@
 
															 #include <stddef.h>
														
 
															 /* Version macro; major * 1000 + minor * 10 + patch */
														
 
															-#define MESHOPTIMIZER_VERSION 240 /* 0.24 */
														
 
															+#define MESHOPTIMIZER_VERSION 250 /* 0.25 */
														
 
															 /* If no API is defined, assume default */
														
 
															 #ifndef MESHOPTIMIZER_API
														
@@ -75,7 +75,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
 
															 MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
														
 
															 /**
														
 
															- * Experimental: Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
														
 
															+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
														
 
															  * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
														
 
															  * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
														
 
															  * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
														
@@ -85,7 +85,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destinat
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															  * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
														
 
															+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
														
 
															 /**
														
 
															  * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
														
@@ -124,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 
															  */
														
 
															 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
														
 
															+/**
														
 
															+ * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index.
														
 
															+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
														
 
															+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
														
 
															+ *
														
 
															+ * destination must contain enough space for the resulting remap table (vertex_count elements)
														
 
															+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															+ */
														
 
															+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
														
 
															+
														
 
															 /**
														
 
															  * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
														
 
															  * Each triangle is converted into a 6-vertex patch with the following layout:
														
@@ -155,7 +165,7 @@ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* des
 
															 /**
														
 
															  * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
														
 
															- * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using nointerpolate attribute.
														
 
															+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
														
 
															  * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
														
 
															  * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
														
 
															  * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
														
@@ -298,7 +308,7 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_
 
															 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
														
 
															 /**
														
 
															- * Experimental: Vertex buffer encoder
														
 
															+ * Vertex buffer encoder
														
 
															  * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
														
 
															  * For compression level to take effect, the vertex encoding version must be set to 1.
														
 
															  * The default compression level implied by meshopt_encodeVertexBuffer is 2.
														
@@ -306,7 +316,7 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, si
 
															  * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
														
 
															  * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
														
 
															+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
														
 
															 /**
														
 
															  * Set vertex encoder format version
														
@@ -343,10 +353,14 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
 
															  *
														
 
															  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
														
 
															  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
														
 
															+ *
														
 
															+ * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
														
 
															+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
														
 
															  */
														
 
															 MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
														
 
															 MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
														
 
															 MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
														
 
															+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
														
 
															 /**
														
 
															  * Vertex buffer filter encoders
														
@@ -363,6 +377,10 @@ MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_
 
															  * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
														
 
															  * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
														
 
															  * Input data must contain stride/4 floats for every vector (count*stride/4 total).
														
 
															+ *
														
 
															+ * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
														
 
															+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
														
 
															+ * Input data must contain 4 floats for every color (count*4 total).
														
 
															  */
														
 
															 enum meshopt_EncodeExpMode
														
 
															 {
														
@@ -379,6 +397,7 @@ enum meshopt_EncodeExpMode
 
															 MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
														
 
															 MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
														
 
															 MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
														
 
															+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
														
 
															 /**
														
 
															  * Simplification options
														
@@ -391,18 +410,34 @@ enum
 
															 	meshopt_SimplifySparse = 1 << 1,
														
 
															 	/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
														
 
															 	meshopt_SimplifyErrorAbsolute = 1 << 2,
														
 
															-	/* Experimental: remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
														
 
															+	/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
														
 
															 	meshopt_SimplifyPrune = 1 << 3,
														
 
															+	/* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
														
 
															+	meshopt_SimplifyRegularize = 1 << 4,
														
 
															+	/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
														
 
															+	meshopt_SimplifyPermissive = 1 << 5,
														
 
															+};
														
 
															+
														
 
															+/**
														
 
															+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
														
 
															+ */
														
 
															+enum
														
 
															+{
														
 
															+	/* Do not move this vertex. */
														
 
															+	meshopt_SimplifyVertex_Lock = 1 << 0,
														
 
															+	/* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
														
 
															+	meshopt_SimplifyVertex_Protect = 1 << 1,
														
 
															 };
														
 
															 /**
														
 
															  * Mesh simplifier
														
 
															  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
														
 
															  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
														
 
															- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh without them prior to simplification.
														
 
															+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
														
 
															  * Returns the number of indices after simplification, with destination containing new index data
														
 
															+ *
														
 
															  * The resulting index buffer references vertices from the original vertex buffer.
														
 
															- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															  *
														
 
															  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
@@ -414,50 +449,86 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 
															 /**
														
 
															  * Mesh simplifier with attribute metric
														
 
															- * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
														
 
															- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
														
 
															+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
														
 
															+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
														
 
															+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
														
 
															+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
														
 
															+ * Returns the number of indices after simplification, with destination containing new index data
														
 
															+ *
														
 
															+ * The resulting index buffer references vertices from the original vertex buffer.
														
 
															+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
														
 
															  *
														
 
															+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
														
 
															+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															  * vertex_attributes should have attribute_count floats for each vertex
														
 
															  * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
														
 
															  * attribute_count must be <= 32
														
 
															  * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
														
 
															+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
														
 
															+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
														
 
															+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
														
 
															  */
														
 
															 MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
														
 
															+/**
														
 
															+ * Experimental: Mesh simplifier with position/attribute update
														
 
															+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
														
 
															+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
														
 
															+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
														
 
															+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
														
 
															+ * Returns the number of indices after simplification, indices are destructively updated with new index data
														
 
															+ *
														
 
															+ * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
														
 
															+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
														
 
															+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
														
 
															+ *
														
 
															+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															+ * vertex_attributes should have attribute_count floats for each vertex
														
 
															+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
														
 
															+ * attribute_count must be <= 32
														
 
															+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
														
 
															+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
														
 
															+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
														
 
															+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
														
 
															+ */
														
 
															+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
														
 
															+
														
 
															 /**
														
 
															  * Experimental: Mesh simplifier (sloppy)
														
 
															  * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
														
 
															  * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
														
 
															  * Returns the number of indices after simplification, with destination containing new index data
														
 
															  * The resulting index buffer references vertices from the original vertex buffer.
														
 
															- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															  *
														
 
															  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position
														
 
															  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
														
 
															  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
														
 
															+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
														
 
															 /**
														
 
															- * Experimental: Mesh simplifier (pruner)
														
 
															+ * Mesh simplifier (pruner)
														
 
															  * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
														
 
															  * Returns the number of indices after simplification, with destination containing new index data
														
 
															  * The resulting index buffer references vertices from the original vertex buffer.
														
 
															- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															  *
														
 
															  * destination must contain enough space for the target index buffer, worst case is index_count elements
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
														
 
															+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
														
 
															 /**
														
 
															  * Point cloud simplifier
														
 
															  * Reduces the number of points in the cloud to reach the given target
														
 
															  * Returns the number of points after simplification, with destination containing new index data
														
 
															  * The resulting index buffer references vertices from the original vertex buffer.
														
 
															- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
														
 
															  *
														
 
															  * destination must contain enough space for the target index buffer (target_vertex_count elements)
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
@@ -548,12 +619,12 @@ struct meshopt_CoverageStatistics
 
															 };
														
 
															 /**
														
 
															- * Experimental: Coverage analyzer
														
 
															+ * Coverage analyzer
														
 
															  * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
														
 
															  *
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
														
 
															+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
														
 
															 /**
														
 
															  * Meshlet is a small mesh cluster (subset) that consists of:
														
@@ -674,26 +745,26 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsig
 
															 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
														
 
															 /**
														
 
															- * Experimental: Sphere bounds generator
														
 
															+ * Sphere bounds generator
														
 
															  * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
														
 
															  *
														
 
															  * positions should have float3 position in the first 12 bytes of each element
														
 
															  * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
														
 
															+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
														
 
															 /**
														
 
															- * Experimental: Cluster partitioner
														
 
															+ * Cluster partitioner
														
 
															  * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
														
 
															  *
														
 
															- * destination must contain enough space for the resulting partiotion data (cluster_count elements)
														
 
															+ * destination must contain enough space for the resulting partition data (cluster_count elements)
														
 
															  * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
														
 
															  * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
														
 
															  * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used)
														
 
															  * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
														
 
															+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
														
 
															 /**
														
 
															  * Spatial sorter
														
@@ -715,14 +786,14 @@ MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const
 
															 MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
														
 
															 /**
														
 
															- * Experimental: Spatial clusterizer
														
 
															+ * Spatial clusterizer
														
 
															  * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
														
 
															  * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size.
														
 
															  *
														
 
															  * destination must contain enough space for the resulting index buffer (vertex_count elements)
														
 
															  * vertex_positions should have float3 position in the first 12 bytes of each vertex
														
 
															  */
														
 
															-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
														
 
															+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
														
 
															 /**
														
 
															  * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
														
@@ -829,6 +900,8 @@ inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_co
 
															 template <typename T>
														
 
															 inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
														
 
															 template <typename T>
														
 
															+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
														
 
															+template <typename T>
														
 
															 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
														
 
															 template <typename T>
														
 
															 inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
														
@@ -890,14 +963,21 @@ inline int meshopt_quantizeSnorm(float v, int N)
 
															 class meshopt_Allocator
														
 
															 {
														
 
															 public:
														
 
															-	template <typename T>
														
 
															-	struct StorageT
														
 
															+	struct Storage
														
 
															 	{
														
 
															-		static void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
														
 
															-		static void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
														
 
															+		void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
														
 
															+		void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
														
 
															 	};
														
 
															-	typedef StorageT<void> Storage;
														
 
															+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
														
 
															+	MESHOPTIMIZER_API static Storage& storage();
														
 
															+#else
														
 
															+	static Storage& storage()
														
 
															+	{
														
 
															+		static Storage s = {::operator new, ::operator delete };
														
 
															+		return s;
														
 
															+	}
														
 
															+#endif
														
 
															 	meshopt_Allocator()
														
 
															 	    : blocks()
														
@@ -908,14 +988,14 @@ public:
 
															 	~meshopt_Allocator()
														
 
															 	{
														
 
															 		for (size_t i = count; i > 0; --i)
														
 
															-			Storage::deallocate(blocks[i - 1]);
														
 
															+			storage().deallocate(blocks[i - 1]);
														
 
															 	}
														
 
															 	template <typename T>
														
 
															 	T* allocate(size_t size)
														
 
															 	{
														
 
															 		assert(count < sizeof(blocks) / sizeof(blocks[0]));
														
 
															-		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
														
 
															+		T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
														
 
															 		blocks[count++] = result;
														
 
															 		return result;
														
 
															 	}
														
@@ -923,7 +1003,7 @@ public:
 
															 	void deallocate(void* ptr)
														
 
															 	{
														
 
															 		assert(count > 0 && blocks[count - 1] == ptr);
														
 
															-		Storage::deallocate(ptr);
														
 
															+		storage().deallocate(ptr);
														
 
															 		count--;
														
 
															 	}
														
@@ -931,12 +1011,6 @@ private:
 
															 	void* blocks[24];
														
 
															 	size_t count;
														
 
															 };
														
 
															-
														
 
															-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
														
 
															-template <typename T>
														
 
															-void* (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
														
 
															-template <typename T>
														
 
															-void (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
														
 
															 #endif
														
 
															 /* Inline implementation for C++ templated wrappers */
														
@@ -958,7 +1032,7 @@ struct meshopt_IndexAdapter<T, false>
 
															 	{
														
 
															 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
														
 
															-		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
														
 
															+		data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));
														
 
															 		if (input)
														
 
															 		{
														
@@ -975,7 +1049,7 @@ struct meshopt_IndexAdapter<T, false>
 
															 				result[i] = T(data[i]);
														
 
															 		}
														
 
															-		meshopt_Allocator::Storage::deallocate(data);
														
 
															+		meshopt_Allocator::storage().deallocate(data);
														
 
															 	}
														
 
															 };
														
@@ -1197,13 +1271,21 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
 
															 	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
														
 
															 }
														
 
															+template <typename T>
														
 
															+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
														
 
															+{
														
 
															+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
														
 
															+
														
 
															+	return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
														
 
															+}
														
 
															+
														
 
															 template <typename T>
														
 
															 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
														
 
															 {
														
 
															 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
														
 
															 	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
														
 
															-	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
														
 
															+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
														
 
															 }
														
 
															 template <typename T>
														
--- a/thirdparty/meshoptimizer/overdrawoptimizer.cpp
+++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
@@ -10,24 +10,24 @@
 
															 namespace meshopt
														
 
															 {
														
 
															-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
														
 
															+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
														
 
															 {
														
 
															 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
														
 
															 	float mesh_centroid[3] = {};
														
 
															-	for (size_t i = 0; i < index_count; ++i)
														
 
															+	for (size_t i = 0; i < vertex_count; ++i)
														
 
															 	{
														
 
															-		const float* p = vertex_positions + vertex_stride_float * indices[i];
														
 
															+		const float* p = vertex_positions + vertex_stride_float * i;
														
 
															 		mesh_centroid[0] += p[0];
														
 
															 		mesh_centroid[1] += p[1];
														
 
															 		mesh_centroid[2] += p[2];
														
 
															 	}
														
 
															-	mesh_centroid[0] /= index_count;
														
 
															-	mesh_centroid[1] /= index_count;
														
 
															-	mesh_centroid[2] /= index_count;
														
 
															+	mesh_centroid[0] /= float(vertex_count);
														
 
															+	mesh_centroid[1] /= float(vertex_count);
														
 
															+	mesh_centroid[2] /= float(vertex_count);
														
 
															 	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
														
 
															 	{
														
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 
															 	// fill sort data
														
 
															 	float* sort_data = allocator.allocate<float>(cluster_count);
														
 
															-	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
														
 
															+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
														
 
															 	// sort clusters using sort data
														
 
															 	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
														
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
--- a/thirdparty/meshoptimizer/vertexfilter.cpp
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -165,6 +165,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 
															 		data[i] = u.ui;
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+template <typename ST, typename T>
														
 
															+static void decodeFilterColor(T* data, size_t count)
														
 
															+{
														
 
															+	const float max = float((1 << (sizeof(T) * 8)) - 1);
														
 
															+
														
 
															+	for (size_t i = 0; i < count; ++i)
														
 
															+	{
														
 
															+		// recover scale from alpha high bit
														
 
															+		int as = data[i * 4 + 3];
														
 
															+		as |= as >> 1;
														
 
															+		as |= as >> 2;
														
 
															+		as |= as >> 4;
														
 
															+		as |= as >> 8; // noop for 8-bit
														
 
															+
														
 
															+		// convert to RGB in fixed point (co/cg are sign extended)
														
 
															+		int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
														
 
															+
														
 
															+		int r = y + co - cg;
														
 
															+		int g = y + cg;
														
 
															+		int b = y - co - cg;
														
 
															+
														
 
															+		// expand alpha by one bit to match other components
														
 
															+		int a = data[i * 4 + 3];
														
 
															+		a = ((a << 1) & as) | (a & 1);
														
 
															+
														
 
															+		// compute scaling factor
														
 
															+		float ss = max / float(as);
														
 
															+
														
 
															+		// rounded float->int
														
 
															+		int rf = int(float(r) * ss + 0.5f);
														
 
															+		int gf = int(float(g) * ss + 0.5f);
														
 
															+		int bf = int(float(b) * ss + 0.5f);
														
 
															+		int af = int(float(a) * ss + 0.5f);
														
 
															+
														
 
															+		data[i * 4 + 0] = T(rf);
														
 
															+		data[i * 4 + 1] = T(gf);
														
 
															+		data[i * 4 + 2] = T(bf);
														
 
															+		data[i * 4 + 3] = T(af);
														
 
															+	}
														
 
															+}
														
 
															 #endif
														
 
															 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
														
@@ -386,6 +427,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 
															 		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
														
 
															+{
														
 
															+	for (size_t i = 0; i < count; i += 4)
														
 
															+	{
														
 
															+		__m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
														
 
															+
														
 
															+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
														
 
															+		__m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
														
 
															+		__m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
														
 
															+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
														
 
															+		__m128i af = _mm_srli_epi32(c4, 24);
														
 
															+
														
 
															+		// recover scale from alpha high bit
														
 
															+		__m128i as = af;
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components
														
 
															+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
														
 
															+
														
 
															+		// compute scaling factor
														
 
															+		__m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
														
 
															+
														
 
															+		// convert to RGB in fixed point
														
 
															+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
														
 
															+		__m128i gf = _mm_add_epi32(yf, cgf);
														
 
															+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
														
 
															+
														
 
															+		// rounded signed float->int
														
 
															+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
														
 
															+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
														
 
															+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
														
 
															+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
														
 
															+
														
 
															+		// repack rgba into final value
														
 
															+		__m128i res = rr;
														
 
															+		res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
														
 
															+		res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
														
 
															+		res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
														
 
															+
														
 
															+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
														
 
															+{
														
 
															+	for (size_t i = 0; i < count; i += 4)
														
 
															+	{
														
 
															+		__m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
														
 
															+		__m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
														
 
															+
														
 
															+		// gather both y/co 16-bit pairs in each 32-bit lane
														
 
															+		__m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
														
 
															+		__m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
														
 
															+
														
 
															+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
														
 
															+		__m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
														
 
															+		__m128i cof = _mm_srai_epi32(c4_yco, 16);
														
 
															+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
														
 
															+		__m128i af = _mm_srli_epi32(c4_cga, 16);
														
 
															+
														
 
															+		// recover scale from alpha high bit
														
 
															+		__m128i as = af;
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
														
 
															+		as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components
														
 
															+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
														
 
															+
														
 
															+		// compute scaling factor
														
 
															+		__m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
														
 
															+
														
 
															+		// convert to RGB in fixed point
														
 
															+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
														
 
															+		__m128i gf = _mm_add_epi32(yf, cgf);
														
 
															+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
														
 
															+
														
 
															+		// rounded signed float->int
														
 
															+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
														
 
															+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
														
 
															+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
														
 
															+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
														
 
															+
														
 
															+		// mix r/b and g/a to make 16-bit unpack easier
														
 
															+		__m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
														
 
															+		__m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
														
 
															+
														
 
															+		// pack r/g/b/a using 16-bit unpacks
														
 
															+		__m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
														
 
															+		__m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
														
 
															+
														
 
															+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
														
 
															+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
														
 
															+	}
														
 
															+}
														
 
															 #endif
														
 
															 #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
														
@@ -596,6 +736,111 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 
															 		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+static void decodeFilterColorSimd8(unsigned char* data, size_t count) // NEON: decodes packed 8-bit y/co/cg/a elements to r/g/b/a in place, four 32-bit elements per iteration
														
 
															+{
														
 
															+	for (size_t i = 0; i < count; i += 4) // steps by 4 with no tail handling; NOTE(review): presumably the caller guarantees count is a multiple of 4 - confirm
														
 
															+	{
														
 
															+		int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
														
 
															+
														
 
															+		// unpack y/co/cg/a from bits 0-7/8-15/16-23/24-31 of each lane (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
														
 
															+		int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
														
 
															+		int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
														
 
															+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
														
 
															+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
														
 
															+
														
 
															+		// recover scale from alpha high bit: smear the highest set bit of a into all lower bits, so a nonzero a yields a mask of the form 2^k-1
														
 
															+		int32x4_t as = af;
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components: shift left by one, mask with as to drop the high marker bit, and reuse the lowest bit of a as the new low bit
														
 
															+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
														
 
															+
														
 
															+		// compute scaling factor 255 / as, using the vrecpeq_f32 reciprocal estimate followed by a multiply
														
 
															+		float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
														
 
															+
														
 
															+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
														
 
															+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
														
 
															+		int32x4_t gf = vaddq_s32(yf, cgf);
														
 
															+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
														
 
															+
														
 
															+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
														
 
															+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
														
 
															+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
														
 
															+
														
 
															+		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
														
 
															+		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
														
 
															+		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
														
 
															+		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
														
 
															+
														
 
															+		// repack rgba into final value (r/g/b are masked to 8 bits; the upper bits of alpha are shifted out of the 32-bit lane)
														
 
															+		int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
														
 
															+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
														
 
															+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
														
 
															+		res = vorrq_s32(res, vshlq_n_s32(ar, 24));
														
 
															+
														
 
															+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+static void decodeFilterColorSimd16(unsigned short* data, size_t count) // NEON: decodes packed 16-bit y/co/cg/a elements to r/g/b/a in place, four 64-bit elements (two 128-bit loads) per iteration
														
 
															+{
														
 
															+	for (size_t i = 0; i < count; i += 4) // steps by 4 with no tail handling; NOTE(review): presumably the caller guarantees count is a multiple of 4 - confirm
														
 
															+	{
														
 
															+		int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
														
 
															+		int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
														
 
															+
														
 
															+		// deinterleave 32-bit lanes of the two loads: even lanes hold the y/co 16-bit pairs, odd lanes hold the cg/a 16-bit pairs
														
 
															+		int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0];
														
 
															+		int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1];
														
 
															+
														
 
															+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
														
 
															+		int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff));
														
 
															+		int32x4_t cof = vshrq_n_s32(c4_yco, 16);
														
 
															+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16);
														
 
															+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16));
														
 
															+
														
 
															+		// recover scale from alpha high bit: smear the highest set bit of a into all lower bits, so a nonzero a yields a mask of the form 2^k-1
														
 
															+		int32x4_t as = af;
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
														
 
															+		as = vorrq_s32(as, vshrq_n_s32(as, 8));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components: shift left by one, mask with as to drop the high marker bit, and reuse the lowest bit of a as the new low bit
														
 
															+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
														
 
															+
														
 
															+		// compute scaling factor 65535 / as
														
 
															+		float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as));
														
 
															+
														
 
															+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
														
 
															+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
														
 
															+		int32x4_t gf = vaddq_s32(yf, cgf);
														
 
															+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
														
 
															+
														
 
															+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
														
 
															+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
														
 
															+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
														
 
															+
														
 
															+		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
														
 
															+		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
														
 
															+		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
														
 
															+		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
														
 
															+
														
 
															+		// mix r/b and g/a to make 16-bit unpack easier (low half of each lane is r or g, high half is b or a; the shift discards the float offset bits)
														
 
															+		int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
														
 
															+		int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
														
 
															+
														
 
															+		// pack r/g/b/a using 16-bit unpacks (interleaving 16-bit halves of rbr and gar yields r, g, b, a order per element)
														
 
															+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
														
 
															+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]);
														
 
															+
														
 
															+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
														
 
															+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
														
 
															+	}
														
 
															+}
														
 
															 #endif
														
 
															 #ifdef SIMD_WASM
														
@@ -651,7 +896,8 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
 
															 static void decodeFilterOctSimd16(short* data, size_t count)
														
 
															 {
														
 
															 	const v128_t sign = wasm_f32x4_splat(-0.f);
														
 
															-	const v128_t zmask = wasm_i32x4_splat(0x7fff);
														
 
															+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
														
 
															+	volatile v128_t zmask = wasm_i32x4_splat(0x7fff);
														
 
															 	for (size_t i = 0; i < count; i += 4)
														
 
															 	{
														
@@ -763,8 +1009,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
															 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
														
 
															 		// compute component index shifted left by 4 (and moved into i32x4 slot)
														
 
															-		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
														
 
															-		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
														
 
															+		v128_t cm = wasm_i32x4_shl(cf, 4);
														
 
															 		// rotate and store
														
 
															 		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
														
@@ -795,6 +1040,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 
															 		wasm_v128_store(&data[i], r);
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+static void decodeFilterColorSimd8(unsigned char* data, size_t count) // WASM SIMD: decodes packed 8-bit y/co/cg/a elements to r/g/b/a in place, four 32-bit elements per iteration
														
 
															+{
														
 
															+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
														
 
															+	volatile v128_t zero = wasm_i32x4_splat(0);
														
 
															+
														
 
															+	for (size_t i = 0; i < count; i += 4) // steps by 4 with no tail handling; NOTE(review): presumably the caller guarantees count is a multiple of 4 - confirm
														
 
															+	{
														
 
															+		v128_t c4 = wasm_v128_load(&data[i * 4]);
														
 
															+
														
 
															+		// unpack y/co/cg/a from bits 0-7/8-15/16-23/24-31 of each lane (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
														
 
															+		v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff));
														
 
															+		v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24);
														
 
															+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24);
														
 
															+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24)); // OR with zero leaves the value unchanged; presumably it exists only to involve the volatile from the TODO at the top of this function
														
 
															+
														
 
															+		// recover scale from alpha high bit: smear the highest set bit of a into all lower bits, so a nonzero a yields a mask of the form 2^k-1
														
 
															+		v128_t as = af;
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components: shift left by one, mask with as to drop the high marker bit, and reuse the lowest bit of a as the new low bit
														
 
															+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
														
 
															+
														
 
															+		// compute scaling factor 255 / as
														
 
															+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as));
														
 
															+
														
 
															+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
														
 
															+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
														
 
															+		v128_t gf = wasm_i32x4_add(yf, cgf);
														
 
															+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
														
 
															+
														
 
															+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
														
 
															+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
														
 
															+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
														
 
															+
														
 
															+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
														
 
															+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
														
 
															+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
														
 
															+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
														
 
															+
														
 
															+		// repack rgba into final value (r/g/b are masked to 8 bits; the upper bits of alpha are shifted out of the 32-bit lane)
														
 
															+		v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff));
														
 
															+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8));
														
 
															+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16));
														
 
															+		res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24));
														
 
															+
														
 
															+		wasm_v128_store(&data[i * 4], res);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+static void decodeFilterColorSimd16(unsigned short* data, size_t count) // WASM SIMD: decodes packed 16-bit y/co/cg/a elements to r/g/b/a in place, four 64-bit elements (two 128-bit loads) per iteration
														
 
															+{
														
 
															+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
														
 
															+	volatile v128_t zero = wasm_i32x4_splat(0);
														
 
															+
														
 
															+	for (size_t i = 0; i < count; i += 4) // steps by 4 with no tail handling; NOTE(review): presumably the caller guarantees count is a multiple of 4 - confirm
														
 
															+	{
														
 
															+		v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]);
														
 
															+		v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]);
														
 
															+
														
 
															+		// gather both y/co 16-bit pairs in each 32-bit lane of c4_yco, and the cg/a pairs in c4_cga (as consumed by the unpack below)
														
 
															+		v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1);
														
 
															+		v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1);
														
 
															+
														
 
															+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
														
 
															+		v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff));
														
 
															+		v128_t cof = wasm_i32x4_shr(c4_yco, 16);
														
 
															+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16);
														
 
															+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16)); // OR with zero leaves the value unchanged; presumably it exists only to involve the volatile from the TODO at the top of this function
														
 
															+
														
 
															+		// recover scale from alpha high bit: smear the highest set bit of a into all lower bits, so a nonzero a yields a mask of the form 2^k-1
														
 
															+		v128_t as = af;
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
														
 
															+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 8));
														
 
															+
														
 
															+		// expand alpha by one bit to match other components: shift left by one, mask with as to drop the high marker bit, and reuse the lowest bit of a as the new low bit
														
 
															+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
														
 
															+
														
 
															+		// compute scaling factor 65535 / as
														
 
															+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as));
														
 
															+
														
 
															+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
														
 
															+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
														
 
															+		v128_t gf = wasm_i32x4_add(yf, cgf);
														
 
															+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
														
 
															+
														
 
															+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
														
 
															+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
														
 
															+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
														
 
															+
														
 
															+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
														
 
															+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
														
 
															+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
														
 
															+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
														
 
															+
														
 
															+		// mix r/b and g/a to make 16-bit unpack easier (low half of each lane is r or g, high half is b or a; the shift discards the float offset bits)
														
 
															+		v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
														
 
															+		v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
														
 
															+
														
 
															+		// pack r/g/b/a using 16-bit unpacks
														
 
															+		v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
														
 
															+		v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
														
 
															+
														
 
															+		wasm_v128_store(&data[(i + 0) * 4], res_0);
														
 
															+		wasm_v128_store(&data[(i + 2) * 4], res_1);
														
 
															+	}
														
 
															+}
														
 
															 #endif
														
 
															 // optimized variant of frexp
														
@@ -872,6 +1228,25 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 
															 #endif
														
 
															 }
														
 
															+void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride) // decodes count color-filtered elements in buffer in place; stride selects 8-bit (4) or 16-bit (8) components
														
 
															+{
														
 
															+	using namespace meshopt;
														
 
															+
														
 
															+	assert(stride == 4 || stride == 8); // only these two element sizes are supported; any other stride is a caller bug
														
 
															+
														
 
															+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
														
 
															+	if (stride == 4)
														
 
															+		dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4); // 8-bit components; NOTE(review): the trailing 4 is presumably the per-element component count expected by dispatchSimd - confirm
														
 
															+	else
														
 
															+		dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4); // 16-bit components
														
 
															+#else // no SIMD path compiled in: use the scalar template, instantiated with the signed type of matching width
														
 
															+	if (stride == 4)
														
 
															+		decodeFilterColor<signed char>(static_cast<unsigned char*>(buffer), count);
														
 
															+	else
														
 
															+		decodeFilterColor<short>(static_cast<unsigned short*>(buffer), count);
														
 
															+#endif
														
 
															+}
														
 
															+
														
 
															 void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
														
 
															 {
														
 
															 	assert(stride == 4 || stride == 8);
														
@@ -1042,6 +1417,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 
															 	}
														
 
															 }
														
 
															+void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data) // encodes count RGBA float quadruples from data into y/co/cg/a integers in destination; stride selects 8-bit (4) or 16-bit (8) output components
														
 
															+{
														
 
															+	assert(stride == 4 || stride == 8);
														
 
															+	assert(bits >= 2 && bits <= 16); // NOTE(review): with stride == 4 the values are narrowed to unsigned char below, so bits > 8 would be truncated - confirm callers keep bits <= 8 in that case
														
 
															+
														
 
															+	unsigned char* d8 = static_cast<unsigned char*>(destination); // two typed views of the same output buffer; only the one matching stride is written
														
 
															+	unsigned short* d16 = static_cast<unsigned short*>(destination);
														
 
															+
														
 
															+	for (size_t i = 0; i < count; ++i)
														
 
															+	{
														
 
															+		const float* c = &data[i * 4]; // input is four floats per element: r, g, b, a
														
 
															+
														
 
															+		int fr = meshopt_quantizeUnorm(c[0], bits);
														
 
															+		int fg = meshopt_quantizeUnorm(c[1], bits);
														
 
															+		int fb = meshopt_quantizeUnorm(c[2], bits);
														
 
															+
														
 
															+		// YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers
														
 
															+		int fco = (fr - fb) / 2;
														
 
															+		int tmp = fb + fco;
														
 
															+		int fcg = (fg - tmp) / 2;
														
 
															+		int fy = tmp + fcg;
														
 
															+
														
 
															+		// validate that R/G/B can be reconstructed with K bit integers (OR-ing the three values and comparing as unsigned rejects negatives as well as values >= 2^bits)
														
 
															+		assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
														
 
															+
														
 
															+		// alpha: K-1-bit encoding with high bit set to 1 (bit K-1 is always set, which is the "alpha high bit" the decoders use to recover the scale)
														
 
															+		int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
														
 
															+
														
 
															+		if (stride == 4)
														
 
															+		{
														
 
															+			d8[i * 4 + 0] = (unsigned char)(fy);
														
 
															+			d8[i * 4 + 1] = (unsigned char)(fco); // co/cg may be negative; the cast keeps their low 8 bits
														
 
															+			d8[i * 4 + 2] = (unsigned char)(fcg);
														
 
															+			d8[i * 4 + 3] = (unsigned char)(fa);
														
 
															+		}
														
 
															+		else
														
 
															+		{
														
 
															+			d16[i * 4 + 0] = (unsigned short)(fy);
														
 
															+			d16[i * 4 + 1] = (unsigned short)(fco); // co/cg may be negative; the cast keeps their low 16 bits
														
 
															+			d16[i * 4 + 2] = (unsigned short)(fcg);
														
 
															+			d16[i * 4 + 3] = (unsigned short)(fa);
														
 
															+		}
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															 #undef SIMD_SSE
														
 
															 #undef SIMD_NEON
														
 
															 #undef SIMD_WASM