
Updated meshoptimizer.

Бранимир Караџић committed 3 years ago (commit 9bb2dbdb70)

+ 3 - 3
3rdparty/meshoptimizer/src/clusterizer.cpp

@@ -464,7 +464,7 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
@@ -687,7 +687,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 
 	assert(index_count % 3 == 0);
 	assert(index_count / 3 <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
@@ -839,7 +839,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	using namespace meshopt;
 
 	assert(triangle_count <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	unsigned int indices[kMeshletMaxTriangles * 3];
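
The tightened lower bound makes the existing contract explicit: every function that takes vertex_positions reads a float3 from the first 12 bytes of each vertex, so a stride below 12 was never valid to begin with. A minimal sketch of a conforming layout (the Vertex struct is illustrative, not part of the diff):

    struct Vertex
    {
        float px, py, pz; // position must occupy the first 12 bytes
        float nx, ny, nz; // further attributes are skipped via the stride
        float tu, tv;
    };

    // stride = sizeof(Vertex) = 32: at least 12, at most 256, and a multiple of
    // sizeof(float), so all three assertions above hold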

+ 2 - 2
3rdparty/meshoptimizer/src/indexgenerator.cpp

@@ -412,7 +412,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -483,7 +483,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;

+ 12 - 12
3rdparty/meshoptimizer/src/meshoptimizer.h

@@ -37,8 +37,8 @@ extern "C" {
 #endif
 
 /**
- * Vertex attribute stream, similar to glVertexPointer
- * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ * Vertex attribute stream
+ * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size).
  */
 struct meshopt_Stream
 {
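
A hedged sketch of the clarified contract for an interleaved buffer, reusing the hypothetical Vertex layout from the sketch above; initializer order follows the struct fields (data, size, stride):

    // two attribute streams over one interleaved buffer; stride >= size holds for both
    meshopt_Stream streams[2] = {
        {&vertices[0].px, sizeof(float) * 3, sizeof(Vertex)}, // positions
        {&vertices[0].nx, sizeof(float) * 3, sizeof(Vertex)}, // normals
    };

An array like this is what the multi-stream entry points such as meshopt_generateVertexRemapMulti consume.
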
@@ -115,7 +115,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest
  * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
  *
  * destination must contain enough space for the resulting index buffer (index_count*2 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
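
A usage sketch for the declaration above; the destination is sized per the comment (each input triangle produces 6 indices), and the mesh buffers follow the earlier hypothetical Vertex layout:

    std::vector<unsigned int> adjacency(index_count * 2);
    meshopt_generateAdjacencyIndexBuffer(adjacency.data(), indices, index_count,
        &vertices[0].px, vertex_count, sizeof(Vertex));
    // draw with a triangles-adjacency topology so the GS sees all 6 vertices per triangle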
 
@@ -131,7 +131,7 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
  * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
  *
  * destination must contain enough space for the resulting index buffer (index_count*4 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
@@ -171,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
  */
 MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
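
A call sketch under the same assumptions; the pre-pass is required by the comment above, and a threshold of 1.05 lets the optimizer trade up to 5% of vertex cache efficiency for less overdraw:

    // indices must already be the output of the vertex cache optimizer
    meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
    meshopt_optimizeOverdraw(indices, indices, index_count,
        &vertices[0].px, vertex_count, sizeof(Vertex), /* threshold */ 1.05f);
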
@@ -331,7 +331,7 @@ enum
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
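
A sketch of a conforming call; the target values are illustrative, and the destination is sized for the stated worst case:

    std::vector<unsigned int> lod(index_count); // worst case, *not* target_index_count
    float lod_error = 0.f;

    size_t lod_size = meshopt_simplify(lod.data(), indices, index_count,
        &vertices[0].px, vertex_count, sizeof(Vertex),
        /* target_index_count */ index_count / 4, /* target_error */ 0.01f,
        /* options */ 0, &lod_error);
    lod.resize(lod_size);
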
@@ -347,7 +347,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
@@ -361,7 +361,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer (target_vertex_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
 
@@ -423,7 +423,7 @@ struct meshopt_OverdrawStatistics
  * Returns overdraw statistics using a software rasterizer
  * Results may not match actual GPU performance
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
@@ -461,7 +461,7 @@ struct meshopt_Meshlet
  * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
  * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
  * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
  * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
@@ -503,7 +503,7 @@ struct meshopt_Bounds
  * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
  * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
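
For reference, the apex-based rejection test this header documents is dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff; a scalar sketch with the normalize folded into the right-hand side (camera_position is assumed):

    meshopt_Bounds bounds = meshopt_computeClusterBounds(indices, index_count,
        &vertices[0].px, vertex_count, sizeof(Vertex));

    float dx = bounds.cone_apex[0] - camera_position[0];
    float dy = bounds.cone_apex[1] - camera_position[1];
    float dz = bounds.cone_apex[2] - camera_position[2];
    float dist = sqrtf(dx * dx + dy * dy + dz * dz);

    // cluster faces entirely away from the camera and can be skipped
    bool culled = dx * bounds.cone_axis[0] + dy * bounds.cone_axis[1] +
        dz * bounds.cone_axis[2] >= bounds.cone_cutoff * dist;
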
@@ -523,7 +523,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destinati
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 

+ 1 - 1
3rdparty/meshoptimizer/src/overdrawanalyzer.cpp

@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;

+ 1 - 1
3rdparty/meshoptimizer/src/overdrawoptimizer.cpp

@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;

+ 4 - 4
3rdparty/meshoptimizer/src/simplifier.cpp

@@ -1282,7 +1282,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
@@ -1425,7 +1425,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 
@@ -1556,7 +1556,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_vertex_count <= vertex_count);
 
@@ -1668,7 +1668,7 @@ float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count,
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);

+ 2 - 2
3rdparty/meshoptimizer/src/spatialorder.cpp

@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;

+ 71 - 17
3rdparty/meshoptimizer/src/vertexcodec.cpp

@@ -50,6 +50,12 @@
 #define SIMD_TARGET
 #endif
 
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
 
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
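
The two branchless counts above can be sanity-checked in scalar code; a standalone sketch (not part of the diff) that mirrors the mode-1 bit rearrangement and the nibble-sum multiply, validated against a naive field scan:

    #include <assert.h>
    #include <stdint.h>

    // mode 1: data32 holds 16 two-bit fields; a field of 3 means one explicit byte follows
    static int countMode1(uint32_t data32)
    {
        data32 &= data32 >> 1; // low bit of each field is now (field == 3)

        // even-numbered fields keep their flags at bits 0,4,...,28; the << 30 copy
        // places odd-numbered fields at 32,36,...,60, so all 16 flags sit on nibble low bits
        uint64_t data64 = ((uint64_t)data32 << 30) | (data32 & 0x3fffffff);

        // nibble-wise horizontal sum via multiply; requires the sum to fit in 4 bits
        return (int)(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
    }

    // mode 2: data64 holds 16 four-bit fields; a field of 15 means one explicit byte follows
    static int countMode2(uint64_t data64)
    {
        data64 &= data64 >> 1;
        data64 &= data64 >> 2; // low bit of each nibble is now (field == 15)
        return (int)(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
    }

    int main()
    {
        for (uint32_t i = 0; i < 1 << 16; ++i)
        {
            uint32_t v = i * 2654435761u; // arbitrary spread of bit patterns

            int naive = 0;
            for (int j = 0; j < 16; ++j)
                naive += ((v >> (j * 2)) & 3) == 3;

            // the encoder guarantees datacnt < 16 (16 would have used mode 3)
            if (naive < 16)
                assert(countMode1(v) == naive);
        }

        assert(countMode2(0) == 0);
        assert(countMode2(0xf00000000000000full) == 2); // fields 0 and 15 are sentinels
        return 0;
    }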
 
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
-
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
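
The multiply-based movemask replaces a vector constant load and several horizontal adds with two scalar multiplies; a standalone check of the z3-found constant, exhaustive over all 256 admissible lane patterns (each byte 0x00 or 0xff):

    #include <assert.h>
    #include <stdint.h>

    int main()
    {
        const uint64_t magic = 0x000103070f1f3f80ull;

        for (unsigned bits = 0; bits < 256; ++bits)
        {
            // build an 8-byte lane whose bytes are each 0x00 or 0xff
            uint64_t lane = 0;
            for (int i = 0; i < 8; ++i)
                if (bits & (1u << i))
                    lane |= 0xffull << (i * 8);

            // byte i's flag surfaces as bit i of the product's top byte
            assert((uint8_t)((lane * magic) >> 56) == bits);
        }
        return 0;
    }

The same constant and shift appear in the WASM path below, so one check covers both.
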
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	// TODO: This can use v8x16_bitmask in the future
 	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
 	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }

+ 1 - 1
3rdparty/meshoptimizer/src/vertexfilter.cpp

@@ -931,7 +931,7 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 		const float* v = &data[i * stride_float];
 		unsigned int* d = &destination[i * stride_float];
 
-		// use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
+		// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
 		int exp = -100;
 
 		for (size_t j = 0; j < stride_float; ++j)
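
For context on the comment being fixed: the exponential filter stores, per component, a signed 24-bit mantissa in the low bits and a signed 8-bit exponent in the top byte of each 32-bit word. A hedged decode sketch of that layout (a simplified reading of the corresponding decoder, not the diff itself):

    #include <math.h>

    // reconstruct one filtered component: value = mantissa * 2^exponent
    static float decodeFilterExp1(unsigned int u)
    {
        int m = (int)(u << 8) >> 8; // sign-extend the 24-bit mantissa
        int e = (int)u >> 24;       // signed exponent from the top byte
        return ldexpf((float)m, e);
    }

Encoding all components of a vertex against their shared maximum exponent is what keeps every mantissa in range, as the corrected comment notes.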