Бранимир Караџић пре 6 година
родитељ
комит
e829a18299

+ 14 - 3
3rdparty/meshoptimizer/Makefile

@@ -24,6 +24,10 @@ CFLAGS=-g -Wall -Wextra -Werror -std=c89
 CXXFLAGS=-g -Wall -Wextra -Wshadow -Wno-missing-field-initializers -Werror -std=c++98
 LDFLAGS=
 
+WASM_SOURCES=src/vertexcodec.cpp src/indexcodec.cpp
+WASM_EXPORTS=["_meshopt_decodeVertexBuffer","_meshopt_decodeIndexBuffer","_sbrk","__start"]
+WASM_FLAGS=-O3 -DNDEBUG -s EXPORTED_FUNCTIONS='$(WASM_EXPORTS)' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=24576 -s TOTAL_MEMORY=65536
+
 ifeq ($(config),iphone)
 	IPHONESDK=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk
 	CFLAGS+=-arch armv7 -arch arm64 -isysroot $(IPHONESDK)
@@ -70,10 +74,17 @@ format:
 gltfpack: $(GLTFPACK_OBJECTS) $(LIBRARY)
 	$(CXX) $^ $(LDFLAGS) -o $@
 
-js/meshopt_decoder.js: src/vertexcodec.cpp src/indexcodec.cpp
+build/decoder_base.wasm: $(WASM_SOURCES)
+	@mkdir -p build
+	emcc $^ $(WASM_FLAGS) -o $@
+
+build/decoder_simd.wasm: $(WASM_SOURCES)
 	@mkdir -p build
-	emcc $(filter %.cpp,$^) -O3 -DNDEBUG -s EXPORTED_FUNCTIONS='["_meshopt_decodeVertexBuffer", "_meshopt_decodeIndexBuffer", "_sbrk"]' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=24576 -s TOTAL_MEMORY=65536 -o build/meshopt_decoder.wasm
-	sed -i "s#\(var wasm = \)\".*\";#\\1\"$$(cat build/meshopt_decoder.wasm | base64 -w 0)\";#" $@
+	emcc $^ $(WASM_FLAGS) -o $@ -munimplemented-simd128 -mbulk-memory
+
+js/meshopt_decoder.js: build/decoder_base.wasm build/decoder_simd.wasm
+	sed -i "s#\(var wasm_base = \)\".*\";#\\1\"$$(cat build/decoder_base.wasm | hexdump -v -e '1/1 "%02X"')\";#" $@
+	sed -i "s#\(var wasm_simd = \)\".*\";#\\1\"$$(cat build/decoder_simd.wasm | hexdump -v -e '1/1 "%02X"')\";#" $@
 
 $(EXECUTABLE): $(DEMO_OBJECTS) $(LIBRARY)
 	$(CXX) $^ $(LDFLAGS) -o $@

Разлика у датотеци није приказана јер је превелика
+ 0 - 0
3rdparty/meshoptimizer/js/meshopt_decoder.js


+ 34 - 49
3rdparty/meshoptimizer/src/vertexcodec.cpp

@@ -61,11 +61,14 @@
 #endif
 
 #ifdef SIMD_WASM
-#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
-#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
-#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
-#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
-#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
+#define wasmx_shuffle_v32x4(v, i, j, k, l) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * j, 4 * j + 1, 4 * j + 2, 4 * j + 3, 4 * k, 4 * k + 1, 4 * k + 2, 4 * k + 3, 4 * l, 4 * l + 1, 4 * l + 2, 4 * l + 3)
+#define wasmx_splat_v32x4(v, i) wasmx_shuffle_v32x4(v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
+#define wasmx_unpacklo_v64x2(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
+#define wasmx_unpackhi_v64x2(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31)
 #endif
 
 namespace meshopt
@@ -414,6 +417,9 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 static unsigned char kDecodeBytesGroupShuffle[256][8];
 static unsigned char kDecodeBytesGroupCount[256];
 
+#ifdef EMSCRIPTEN
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
 static bool decodeBytesGroupBuildTables()
 {
 	for (int mask = 0; mask < 256; ++mask)
@@ -706,24 +712,23 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 
 	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
 
-	return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
+	return wasmx_unpacklo_v64x2(sm0, sm1r);
 }
 
 static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	uint64_t mbits = 0x8040201008040201ull;
-
-	uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits;
-	uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits;
+	v128_t mask_0 = wasmx_shuffle_v32x4(mask, 0, 2, 1, 3);
 
-	uint32_t m0_4 = m0_8 | (m0_8 >> 32);
-	uint32_t m1_4 = m1_8 | (m1_8 >> 32);
+	// TODO: once Chrome supports v128.const, consider doing this AND in vector form (wasm_v128_and) before extracting the lanes
+	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
+	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
 
-	uint16_t m0_2 = m0_4 | (m0_4 >> 16);
-	uint16_t m1_2 = m1_4 | (m1_4 >> 16);
+	uint64_t mask_2 = mask_1a | mask_1b;
+	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
+	uint64_t mask_8 = mask_4 | (mask_4 >> 8);
 
-	mask0 = m0_2 | (m0_2 >> 8);
-	mask1 = m1_2 | (m1_2 >> 8);
+	mask0 = uint8_t(mask_8);
+	mask1 = uint8_t(mask_8 >> 32);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
@@ -748,19 +753,12 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		v128_t sel2 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 4);
 
-		v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
-		v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
 		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
 
 		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
 
-		if (!wasm_i8x16_any_true(mask))
-		{
-			wasm_v128_store(buffer, sel);
-
-			return data + 4;
-		}
-
 		unsigned char mask0, mask1;
 		wasmMoveMask(mask, mask0, mask1);
 
@@ -780,18 +778,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		v128_t sel4 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 8);
 
-		v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
 		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
 
 		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
 
-		if (!wasm_i8x16_any_true(mask))
-		{
-			wasm_v128_store(buffer, sel);
-
-			return data + 8;
-		}
-
 		unsigned char mask0, mask1;
 		wasmMoveMask(mask, mask0, mask1);
 
@@ -871,15 +862,15 @@ static uint8x16_t unzigzag8(uint8x16_t v)
 #ifdef SIMD_WASM
 static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 {
-	v128_t t0 = wasm_unpacklo_v8x16(x0, x1);
-	v128_t t1 = wasm_unpackhi_v8x16(x0, x1);
-	v128_t t2 = wasm_unpacklo_v8x16(x2, x3);
-	v128_t t3 = wasm_unpackhi_v8x16(x2, x3);
-
-	x0 = wasm_unpacklo_v16x8(t0, t2);
-	x1 = wasm_unpackhi_v16x8(t0, t2);
-	x2 = wasm_unpacklo_v16x8(t1, t3);
-	x3 = wasm_unpackhi_v16x8(t1, t3);
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2);
+	x1 = wasmx_unpackhi_v16x8(t0, t2);
+	x2 = wasmx_unpacklo_v16x8(t1, t3);
+	x3 = wasmx_unpackhi_v16x8(t1, t3);
 }
 
 static v128_t unzigzag8(v128_t v)
@@ -977,7 +968,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 #define TEMP v128_t
 #define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
 #define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 #define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
 #define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
 #endif
@@ -1157,12 +1148,6 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	decode = decodeVertexBlock;
 #endif
 
-#if defined(SIMD_WASM)
-	// TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767
-	if (!gDecodeBytesGroupInitialized)
-		gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
-#endif
-
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	assert(gDecodeBytesGroupInitialized);
 	(void)gDecodeBytesGroupInitialized;

Неке датотеке нису приказане због велике количине промена