5 years ago · 5d72520a1d
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -211,7 +211,7 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte
 
				  * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
			
 
				  * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
			
 
				  *
			
 
				- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with 12-bit component encoding and a 2-bit component index indicating which component to reconstruct.
			
 
				+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
			
 
				  * Each component is stored as an 16-bit integer; stride must be equal to 8.
			
 
				  */
			
 
				 MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
			
--- a/3rdparty/meshoptimizer/src/vertexcodec.cpp
+++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp
@@ -73,14 +73,13 @@
 
				 #endif
			
 
				 
			
 
				 #ifdef SIMD_WASM
			
 
				-#define wasmx_swizzle_v32x4(v, i, j, k, l) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * j, 4 * j + 1, 4 * j + 2, 4 * j + 3, 4 * k, 4 * k + 1, 4 * k + 2, 4 * k + 3, 4 * l, 4 * l + 1, 4 * l + 2, 4 * l + 3)
			
 
				-#define wasmx_splat_v32x4(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
			
 
				+#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
			
 
				 #define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
			
 
				 #define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
			
 
				-#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
			
 
				-#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
			
 
				-#define wasmx_unpacklo_v64x2(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
			
 
				-#define wasmx_unpackhi_v64x2(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31)
			
 
				+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
			
 
				+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
			
 
				+#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
			
 
				+#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
			
 
				 #endif
			
 
				 
			
 
				 #if defined(SIMD_WASM)
			
@@ -743,7 +742,7 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 
				 SIMD_TARGET
			
 
				 static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
			
 
				 {
			
 
				-	v128_t mask_0 = wasmx_swizzle_v32x4(mask, 0, 2, 1, 3);
			
 
				+	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
			
 
				 
			
 
				 	// TODO: when Chrome supports v128.const we can try doing vectorized and?
			
 
				 	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
			
@@ -907,7 +906,8 @@ SIMD_TARGET
 
				 static v128_t unzigzag8(v128_t v)
			
 
				 {
			
 
				 	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
			
 
				-	v128_t xr = wasm_u8x16_shr(v, 1);
			
 
				+	// TODO: use wasm_u8x16_shr when v8 fixes codegen for constant shifts
			
 
				+	v128_t xr = wasm_v128_and(wasm_u16x8_shr(v, 1), wasm_i8x16_splat(127));
			
 
				 
			
 
				 	return wasm_v128_xor(xl, xr);
			
 
				 }
			
--- a/3rdparty/meshoptimizer/src/vertexfilter.cpp
+++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp
@@ -12,10 +12,10 @@
 
				 #endif
			
 
				 
			
 
				 #ifdef SIMD_WASM
			
 
				-#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
			
 
				-#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
			
 
				-#define wasmx_unziplo_v32x4(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27)
			
 
				-#define wasmx_unziphi_v32x4(a, b) wasm_v8x16_shuffle(a, b, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31)
			
 
				+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
			
 
				+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
			
 
				+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
			
 
				+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
			
 
				 #endif
			
 
				 
			
 
				 namespace meshopt
			
@@ -57,7 +57,7 @@ static void decodeFilterOct(T* data, size_t count)
 
				 
			
 
				 static void decodeFilterQuat(short* data, size_t count)
			
 
				 {
			
 
				-	const float scale = 1.f / (2047.f * sqrtf(2.f));
			
 
				+	const float scale = 1.f / sqrtf(2.f);
			
 
				 
			
 
				 	static const int order[4][4] = {
			
 
				 	    {1, 2, 3, 0},
			
@@ -68,10 +68,14 @@ static void decodeFilterQuat(short* data, size_t count)
 
				 
			
 
				 	for (size_t i = 0; i < count; ++i)
			
 
				 	{
			
 
				+		// recover scale from the fourth 16-bit component with its bottom 2 bits forced to 1
			
 
				+		int sf = data[i * 4 + 3] | 3;
			
 
				+		float ss = scale / float(sf);
			
 
				+
			
 
				 		// convert x/y/z to [-1..1] (scaled...)
			
 
				-		float x = float(data[i * 4 + 0]) * scale;
			
 
				-		float y = float(data[i * 4 + 1]) * scale;
			
 
				-		float z = float(data[i * 4 + 2]) * scale;
			
 
				+		float x = float(data[i * 4 + 0]) * ss;
			
 
				+		float y = float(data[i * 4 + 1]) * ss;
			
 
				+		float z = float(data[i * 4 + 2]) * ss;
			
 
				 
			
 
				 		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
			
 
				 		float ww = 1.f - x * x - y * y - z * z;
			
@@ -211,7 +215,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
 
				 
			
 
				 static void decodeFilterQuatSimd(short* data, size_t count)
			
 
				 {
			
 
				-	const float scale = 1.f / (2047.f * sqrtf(2.f));
			
 
				+	const float scale = 1.f / sqrtf(2.f);
			
 
				 
			
 
				 	for (size_t i = 0; i < count; i += 4)
			
 
				 	{
			
@@ -226,11 +230,16 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
				 		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
			
 
				 		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
			
 
				 		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
			
 
				+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
			
 
				+
			
 
				+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
			
 
				+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
			
 
				+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
			
 
				 
			
 
				 		// convert x/y/z to [-1..1] (scaled...)
			
 
				-		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), wasm_f32x4_splat(scale));
			
 
				-		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), wasm_f32x4_splat(scale));
			
 
				-		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_splat(scale));
			
 
				+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
			
 
				+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
			
 
				+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
			
 
				 
			
 
				 		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
			
 
				 		// note: i32x4_max_s with 0 is equivalent to f32x4_max
			
@@ -257,7 +266,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
				 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
			
 
				 
			
 
				 		// compute component index shifted left by 4 (and moved into i32x4 slot)
			
 
				-		v128_t cm = wasm_i32x4_shl(wasm_i32x4_shr(q4_zc, 16), 4);
			
 
				+		v128_t cm = wasm_i32x4_shl(cf, 4);
			
 
				 
			
 
				 		// rotate and store
			
 
				 		uint64_t* out = (uint64_t*)&data[i * 4];