|
|
@@ -61,11 +61,14 @@
|
|
|
#endif
|
|
|
|
|
|
#ifdef SIMD_WASM
|
|
|
-#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
|
|
|
-#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
|
|
|
-#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
|
|
|
-#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
|
|
|
-#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
|
|
|
+#define wasmx_shuffle_v32x4(v, i, j, k, l) wasm_v8x16_shuffle(v, v, 4 * (i), 4 * (i) + 1, 4 * (i) + 2, 4 * (i) + 3, 4 * (j), 4 * (j) + 1, 4 * (j) + 2, 4 * (j) + 3, 4 * (k), 4 * (k) + 1, 4 * (k) + 2, 4 * (k) + 3, 4 * (l), 4 * (l) + 1, 4 * (l) + 2, 4 * (l) + 3) // builds a vector from 32-bit lanes i, j, k, l of v (each expanded to its 4 byte indices); lane arguments are parenthesized so expression arguments keep their precedence
|
|
|
+#define wasmx_splat_v32x4(v, i) wasmx_shuffle_v32x4(v, i, i, i, i) // broadcasts 32-bit lane i of v into all four 32-bit lanes
|
|
|
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) // interleaves bytes 0..7 of a and b: a0, b0, a1, b1, ..., a7, b7 (shuffle indices 16..31 address b)
|
|
|
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) // interleaves bytes 8..15 of a and b: a8, b8, a9, b9, ..., a15, b15
|
|
|
+#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23) // interleaves the four 16-bit lanes in the low half of a and b (byte pairs a0-1, b0-1, a2-3, b2-3, ...)
|
|
|
+#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31) // interleaves the four 16-bit lanes in the high half of a and b (byte pairs a8-9, b8-9, a10-11, b10-11, ...)
|
|
|
+#define wasmx_unpacklo_v64x2(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) // result = bytes 0..7 of a followed by bytes 0..7 of b (low 64-bit lane of each)
|
|
|
+#define wasmx_unpackhi_v64x2(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31) // result = bytes 8..15 of a followed by bytes 8..15 of b (high 64-bit lane of each)
|
|
|
#endif
|
|
|
|
|
|
namespace meshopt
|
|
|
@@ -414,6 +417,9 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
|
|
|
static unsigned char kDecodeBytesGroupShuffle[256][8];
|
|
|
static unsigned char kDecodeBytesGroupCount[256];
|
|
|
|
|
|
+#ifdef EMSCRIPTEN
|
|
|
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
|
|
|
+#endif
|
|
|
static bool decodeBytesGroupBuildTables()
|
|
|
{
|
|
|
for (int mask = 0; mask < 256; ++mask)
|
|
|
@@ -706,24 +712,23 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
|
|
|
|
|
|
v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
|
|
|
|
|
|
- return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
|
|
|
+ return wasmx_unpacklo_v64x2(sm0, sm1r);
|
|
|
}
|
|
|
|
|
|
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) // packs one bit per byte of mask: byte i contributes bit (i % 8); bytes 0..7 go to mask0, bytes 8..15 go to mask1
|
|
|
{
|
|
|
-	uint64_t mbits = 0x8040201008040201ull;
|
|
|
-
|
|
|
-	uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits;
|
|
|
-	uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits;
|
|
|
+	v128_t mask_0 = wasmx_shuffle_v32x4(mask, 0, 2, 1, 3); // reorder 32-bit lanes so 64-bit lane 0 holds bytes 0..3 and 8..11, lane 1 holds bytes 4..7 and 12..15
|
|
|
|
|
|
-	uint32_t m0_4 = m0_8 | (m0_8 >> 32);
|
|
|
-	uint32_t m1_4 = m1_8 | (m1_8 >> 32);
|
|
|
+	// TODO: when Chrome supports v128.const we can try doing vectorized and?
|
|
|
+	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; // keep bits 0..3, one per byte, for source bytes 0..3 (low half) and 8..11 (high half)
|
|
|
+	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; // keep bits 4..7, one per byte, for source bytes 4..7 (low half) and 12..15 (high half)
|
|
|
|
|
|
-	uint16_t m0_2 = m0_4 | (m0_4 >> 16);
|
|
|
-	uint16_t m1_2 = m1_4 | (m1_4 >> 16);
|
|
|
+	uint64_t mask_2 = mask_1a | mask_1b; // each 32-bit half now carries 8 distinct bits spread across its 4 bytes
|
|
|
+	uint64_t mask_4 = mask_2 | (mask_2 >> 16); // fold the upper 16 bits of each half onto the lower 16
|
|
|
+	uint64_t mask_8 = mask_4 | (mask_4 >> 8); // fold again: byte 0 and byte 4 each hold the OR of their half's 4 bytes
|
|
|
|
|
|
-	mask0 = m0_2 | (m0_2 >> 8);
|
|
|
-	mask1 = m1_2 | (m1_2 >> 8);
|
|
|
+	mask0 = uint8_t(mask_8); // byte 0: bits gathered from source bytes 0..7
|
|
|
+	mask1 = uint8_t(mask_8 >> 32); // byte 4: bits gathered from source bytes 8..15
|
|
|
}
|
|
|
|
|
|
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
|
|
|
@@ -748,19 +753,12 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
|
|
v128_t sel2 = wasm_v128_load(data);
|
|
|
v128_t rest = wasm_v128_load(data + 4);
|
|
|
|
|
|
- v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
|
|
|
- v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
|
|
|
+ v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
|
|
|
+ v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
|
|
|
v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
|
|
|
|
|
|
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
|
|
|
|
|
|
- if (!wasm_i8x16_any_true(mask))
|
|
|
- {
|
|
|
- wasm_v128_store(buffer, sel);
|
|
|
-
|
|
|
- return data + 4;
|
|
|
- }
|
|
|
-
|
|
|
unsigned char mask0, mask1;
|
|
|
wasmMoveMask(mask, mask0, mask1);
|
|
|
|
|
|
@@ -780,18 +778,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
|
|
v128_t sel4 = wasm_v128_load(data);
|
|
|
v128_t rest = wasm_v128_load(data + 8);
|
|
|
|
|
|
- v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
|
|
|
+ v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
|
|
|
v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
|
|
|
|
|
|
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
|
|
|
|
|
|
- if (!wasm_i8x16_any_true(mask))
|
|
|
- {
|
|
|
- wasm_v128_store(buffer, sel);
|
|
|
-
|
|
|
- return data + 8;
|
|
|
- }
|
|
|
-
|
|
|
unsigned char mask0, mask1;
|
|
|
wasmMoveMask(mask, mask0, mask1);
|
|
|
|
|
|
@@ -871,15 +862,15 @@ static uint8x16_t unzigzag8(uint8x16_t v)
|
|
|
#ifdef SIMD_WASM
|
|
|
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) // in-place byte transpose: afterwards x0..x3 read in sequence hold, for j = 0..15, the 4-byte group (x0[j], x1[j], x2[j], x3[j]) of the inputs
|
|
|
{
|
|
|
-	v128_t t0 = wasm_unpacklo_v8x16(x0, x1);
|
|
|
-	v128_t t1 = wasm_unpackhi_v8x16(x0, x1);
|
|
|
-	v128_t t2 = wasm_unpacklo_v8x16(x2, x3);
|
|
|
-	v128_t t3 = wasm_unpackhi_v8x16(x2, x3);
|
|
|
-
|
|
|
-	x0 = wasm_unpacklo_v16x8(t0, t2);
|
|
|
-	x1 = wasm_unpackhi_v16x8(t0, t2);
|
|
|
-	x2 = wasm_unpacklo_v16x8(t1, t3);
|
|
|
-	x3 = wasm_unpackhi_v16x8(t1, t3);
|
|
|
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); // byte pairs (x0[j], x1[j]) for j = 0..7
|
|
|
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); // byte pairs (x0[j], x1[j]) for j = 8..15
|
|
|
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3); // byte pairs (x2[j], x3[j]) for j = 0..7
|
|
|
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3); // byte pairs (x2[j], x3[j]) for j = 8..15
|
|
|
+
|
|
|
+	x0 = wasmx_unpacklo_v16x8(t0, t2); // 4-byte groups for j = 0..3
|
|
|
+	x1 = wasmx_unpackhi_v16x8(t0, t2); // 4-byte groups for j = 4..7
|
|
|
+	x2 = wasmx_unpacklo_v16x8(t1, t3); // 4-byte groups for j = 8..11
|
|
|
+	x3 = wasmx_unpackhi_v16x8(t1, t3); // 4-byte groups for j = 12..15
|
|
|
}
|
|
|
|
|
|
static v128_t unzigzag8(v128_t v)
|
|
|
@@ -977,7 +968,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
|
|
|
#define TEMP v128_t
|
|
|
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
|
|
|
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
|
|
|
-#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3)
|
|
|
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
|
|
|
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
|
|
|
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
|
|
|
#endif
|
|
|
@@ -1157,12 +1148,6 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
|
|
|
decode = decodeVertexBlock;
|
|
|
#endif
|
|
|
|
|
|
-#if defined(SIMD_WASM)
|
|
|
- // TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767
|
|
|
- if (!gDecodeBytesGroupInitialized)
|
|
|
- gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
|
|
|
-#endif
|
|
|
-
|
|
|
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
|
|
|
assert(gDecodeBytesGroupInitialized);
|
|
|
(void)gDecodeBytesGroupInitialized;
|