
Updated meshoptimizer.

Бранимир Караџић, 5 years ago
parent commit fd94855c7f

3rdparty/meshoptimizer/src/meshoptimizer.h (+2 -2)

@@ -1,5 +1,5 @@
 /**
- * meshoptimizer - version 0.13
+ * meshoptimizer - version 0.14
  *
  * Copyright (C) 2016-2020, by Arseny Kapoulkine ([email protected])
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 130
+#define MESHOPTIMIZER_VERSION 140
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
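With the encoding described above, version 0.14.0 yields 0*1000 + 14*10 + 0 = 140, so downstream code can gate on the bump. A hypothetical compile-time guard (not part of this change):

/* Hypothetical guard against older copies of the header */
#if MESHOPTIMIZER_VERSION < 140
#error "this code requires meshoptimizer 0.14 or newer"
#endif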

3rdparty/meshoptimizer/src/vertexcodec.cpp (+25 -7)

@@ -4,37 +4,45 @@
 #include <assert.h>
 #include <string.h>
 
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define SIMD_NEON
-#endif
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
 
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
 #if defined(__AVX__) || defined(__SSSE3__)
 #define SIMD_SSE
 #endif
 
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
 #if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
 #undef SIMD_SSE
 #define SIMD_AVX
 #endif
 
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
 #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
 #define SIMD_SSE
 #define SIMD_FALLBACK
-#include <intrin.h> // __cpuid
 #endif
 
-// GCC 4.9+ and clang 3.8+ support targeting SIMD instruction sets from individual functions
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
 #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
 #define SIMD_SSE
 #define SIMD_FALLBACK
 #define SIMD_TARGET __attribute__((target("ssse3")))
-#include <cpuid.h> // __cpuid
 #endif
 
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
 #if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
 #define SIMD_NEON
 #endif
 
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+// Note that we need the unimplemented-simd128 subset for a few functions that are de facto implemented
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
 #define SIMD_TARGET __attribute__((target("unimplemented-simd128")))
@@ -44,10 +52,20 @@
 #define SIMD_TARGET
 #endif
 
+#endif // !MESHOPTIMIZER_NO_SIMD
+
 #ifdef SIMD_SSE
 #include <tmmintrin.h>
 #endif
 
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
 #ifdef SIMD_AVX
 #include <immintrin.h>
 #endif
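The new MESHOPTIMIZER_NO_SIMD guard wraps the entire detection block, so a build can opt out of every SIMD path with a single define. A sketch of how a consumer would presumably use it (any equivalent compiler flag works):

// Opt out of all SIMD code paths in this translation unit; typically
// passed on the command line instead, e.g. -DMESHOPTIMIZER_NO_SIMD
#define MESHOPTIMIZER_NO_SIMD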
@@ -1066,7 +1084,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 static unsigned int getCpuFeatures()
 {
 	int cpuinfo[4] = {};
-#if defined(_MSC_VER) && !defined(__clang__)
+#ifdef _MSC_VER
 	__cpuid(cpuinfo, 1);
 #else
 	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
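The unified include block above lets this function compile with either toolchain's intrinsic. For reference, the SSSE3 capability that the fallback path tests for lives in bit 9 of ECX from CPUID leaf 1; a standalone sketch (the bit position is standard x86, the helper name is ours):

#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif

// Sketch: returns nonzero when the CPU reports SSSE3 (CPUID.1:ECX bit 9)
static int cpuSupportsSsse3()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return (cpuinfo[2] >> 9) & 1;
}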

3rdparty/meshoptimizer/src/vertexfilter.cpp (+477 -30)

@@ -3,10 +3,53 @@
 
 #include <math.h>
 
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
 #endif
 
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
 #ifdef SIMD_WASM
 #include <wasm_simd128.h>
 #endif
@@ -21,7 +64,7 @@
 namespace meshopt
 {
 
-#if !defined(SIMD_WASM)
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
 template <typename T>
 static void decodeFilterOct(T* data, size_t count)
 {
@@ -59,13 +102,6 @@ static void decodeFilterQuat(short* data, size_t count)
 {
 	const float scale = 1.f / sqrtf(2.f);
 
-	static const int order[4][4] = {
-	    {1, 2, 3, 0},
-	    {2, 3, 0, 1},
-	    {3, 0, 1, 2},
-	    {0, 1, 2, 3},
-	};
-
 	for (size_t i = 0; i < count; ++i)
 	{
 		// recover scale from the high byte of the component
@@ -90,10 +126,10 @@ static void decodeFilterQuat(short* data, size_t count)
 		int qc = data[i * 4 + 3] & 3;
 
 		// output order is dictated by input index
-		data[i * 4 + order[qc][0]] = short(xf);
-		data[i * 4 + order[qc][1]] = short(yf);
-		data[i * 4 + order[qc][2]] = short(zf);
-		data[i * 4 + order[qc][3]] = short(wf);
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
 	}
 }
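Removing the table is safe because its rows were just rotations of {0, 1, 2, 3}, which the modular form reproduces exactly: for qc = 1 the old row was {2, 3, 0, 1}, and (1+1)&3 == 2, (1+2)&3 == 3, (1+3)&3 == 0, (1+0)&3 == 1. An illustrative equivalence check (not part of the commit; assumes <assert.h>):

static const int order[4][4] = {
    {1, 2, 3, 0}, {2, 3, 0, 1}, {3, 0, 1, 2}, {0, 1, 2, 3}};

for (int qc = 0; qc < 4; ++qc)
	for (int k = 0; k < 4; ++k)
		assert(order[qc][k] == ((qc + k + 1) & 3));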
 
@@ -121,6 +157,418 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 }
 #endif
 
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+inline uint64_t rotateleft64(uint64_t v, int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+// Apple's Clang 8 is actually vanilla Clang 3.9, so there we need to look for
+// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
+#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63));
+#endif
+}
+#endif
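The portable fallback masks both shift counts with 63 so that a rotation by 0 (or any multiple of 64) stays well-defined instead of shifting a 64-bit value by 64, which is undefined behavior. Two illustrative checks (assumes <assert.h>):

assert(rotateleft64(0x8000000000000001ull, 1) == 0x0000000000000003ull);
assert(rotateleft64(0xdeadbeefull, 0) == 0xdeadbeefull); // (64 - 0) & 63 == 0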
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
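The branchless fixup x = x + (t ^ (x & sign)) is worth a gloss: t = min(z, 0) is non-positive, and XORing it with the sign bit of x flips its sign when x is negative, so the addend is sign(x) * t. For z < 0 this reproduces the standard octahedral unfold, mapping x to (1 - |y|) * sign(x). A scalar sketch of the same step (presumably matching the library's scalar path, which this page doesn't show):

float t = z < 0.f ? z : 0.f; // t = min(z, 0)
x += (x >= 0.f) ? t : -t;    // i.e. x += sign(x) * t
y += (y >= 0.f) ? t : -t;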
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f);
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
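Since rotateleft64 masks its count to 6 bits, the rotation amount data[...] << 4 reduces to 16 * qc (only the two low bits of the encoded component survive the mask), so each 16-bit left rotation advances every packed component by one output slot. Illustratively (assumes <assert.h>):

// Lanes packed as [w x y z], w in the low 16 bits; rotating by 16 (qc = 1)
// yields [z w x y], matching the scalar output order (qc + k) & 3.
uint64_t packed = 0x0004000300020001ull; // w=1 x=2 y=3 z=4
assert(rotateleft64(packed, 1 << 4) == 0x0003000200010004ull);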
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24);
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
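For reference, the encoding this decodes packs a signed 8-bit exponent in the top byte and a signed 24-bit mantissa below it; the decoded float is m * 2^e. A standalone scalar sketch (our helper name, using ldexpf rather than the exponent-bias bit trick the SIMD code uses):

#include <math.h>

static float decodeExpScalar(unsigned int v)
{
	int e = int(v) >> 24;       // sign-extended exponent
	int m = int(v << 8) >> 8;   // sign-extended 24-bit mantissa
	return ldexpf(float(m), e); // m * 2^e
}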
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+	float32x4_t r = vrsqrteq_f32(x);
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+	return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+	float32x4_t r = vrecpeq_f32(y);
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+	return vmulq_f32(x, r);
+}
+#endif
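These helpers exist because 32-bit NEON has no vector sqrt or divide, only reciprocal estimates good to roughly 8-9 bits. VRSQRTS computes (3 - a*b)/2, so the refinement line evaluates r * (3 - r^2*x)/2, one Newton-Raphson step for 1/sqrt(x); likewise VRECPS computes 2 - a*b, giving r * (2 - y*r) for 1/y. One step roughly doubles the number of correct bits, which is sufficient for the 8- and 16-bit outputs below.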
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
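The fsnap constant 3 << 22 is the float 12582912.0f (bit pattern 0x4B400000). Adding it to any value of magnitude below 2^22 lands the sum in [2^23, 2^24), where the spacing between representable floats is exactly 1, so the rounded integer appears verbatim in the low mantissa bits. A worked instance (our arithmetic, not library code):

#include <assert.h>
#include <string.h>

void fsnapDemo()
{
	// 100.7f + 12582912.f rounds to 12583013.f, bits 0x4B400065; the low
	// byte 0x65 is 101 == round(100.7). For -100.7f the bits are
	// 0x4B3FFF9B, and the low 16 bits 0xFF9B read as -101 in two's complement.
	float f = 100.7f + 12582912.f;
	unsigned int bits;
	memcpy(&bits, &f, sizeof(bits));
	assert((bits & 0xff) == 101);
}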
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		int32x4_t ef = vshrq_n_s32(v, 24);
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
 #ifdef SIMD_WASM
 static void decodeFilterOctSimd(signed char* data, size_t count)
 {
@@ -140,19 +588,18 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
 		v128_t x = wasm_f32x4_convert_i32x4(xf);
 		v128_t y = wasm_f32x4_convert_i32x4(yf);
-		// TODO: when i32x4_abs is available it might be faster, f32x4_abs is 3 instructions in v8
 		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
 
 		// fixup octahedral coordinates for z<0
-		// note: i32x4_min_s with 0 is equvalent to f32x4_min
+		// note: i32x4_min with 0 is equivalent to f32x4_min
 		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
 
 		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
 		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
 
 		// compute normal length & scale
-		v128_t l = wasm_f32x4_sqrt(wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
-		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), l);
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
@@ -196,19 +643,18 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
 		v128_t x = wasm_f32x4_convert_i32x4(xf);
 		v128_t y = wasm_f32x4_convert_i32x4(yf);
-		// TODO: when i32x4_abs is available it might be faster, f32x4_abs is 3 instructions in v8
 		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
 
 		// fixup octahedral coordinates for z<0
-		// note: i32x4_min_s with 0 is equvalent to f32x4_min
+		// note: i32x4_min with 0 is equivalent to f32x4_min
 		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
 
 		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
 		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
 
 		// compute normal length & scale
-		v128_t l = wasm_f32x4_sqrt(wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
-		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), l);
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
@@ -227,7 +673,6 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
 
 		// patch in .w
-		// TODO: this can use pblendw-like shuffles and we can remove y0r - once LLVM fixes shuffle merging
 		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
 		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
 
@@ -265,7 +710,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
 
 		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		// note: i32x4_max_s with 0 is equivalent to f32x4_max
+		// note: i32x4_max with 0 is equivalent to f32x4_max
 		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
 		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
 
@@ -292,12 +737,12 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t cm = wasm_i32x4_shl(cf, 4);
 
 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
 
-		out[0] = __builtin_rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
-		out[1] = __builtin_rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
-		out[2] = __builtin_rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
-		out[3] = __builtin_rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
 	}
 }
 
@@ -331,7 +776,7 @@ void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_si
 	assert(vertex_count % 4 == 0);
 	assert(vertex_size == 4 || vertex_size == 8);
 
-#if defined(SIMD_WASM)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	if (vertex_size == 4)
 		decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
 	else
@@ -352,7 +797,7 @@ void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_s
 	assert(vertex_size == 8);
 	(void)vertex_size;
 
-#if defined(SIMD_WASM)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
 #else
 	decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
@@ -366,11 +811,13 @@ void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_si
 	assert(vertex_count % 4 == 0);
 	assert(vertex_size % 4 == 0);
 
-#if defined(SIMD_WASM)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
 #else
 	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
 #endif
 }
 
+#undef SIMD_SSE
+#undef SIMD_NEON
 #undef SIMD_WASM
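With this change the three public entry points transparently pick SSE, NEON, Wasm SIMD, or scalar code at the same call sites. A minimal call-site sketch (buffer contents are assumed to come from the matching encode filters; strides follow the asserts shown above):

#include "meshoptimizer.h"

void decodeVertexFilters(signed char* normals, short* rotations, float* positions, size_t vertex_count)
{
	// octahedral normals, 4 bytes per vertex (vertex_count must be a multiple of 4)
	meshopt_decodeFilterOct(normals, vertex_count, 4);

	// quantized quaternions, 4 shorts per vertex
	meshopt_decodeFilterQuat(rotations, vertex_count, 8);

	// shared-exponent floats; vertex_size is the stride in bytes (12 = 3 floats)
	meshopt_decodeFilterExp(positions, vertex_count, 12);
}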