@@ -12,7 +12,12 @@
 #define SIMD_SSE
 #endif
 
-#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
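+// The AVX-512 path needs VBMI2 (byte expand), VBMI (multishift), VL (128-bit forms) and POPCNT; when the compiler enables all of these, it takes priority over the SSSE3 path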
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
 #define SIMD_SSE
 #define SIMD_FALLBACK
 #include <intrin.h> // __cpuid
@@ -22,10 +27,19 @@
 #define SIMD_NEON
 #endif
 
+// WebAssembly SIMD implementation requires a few bleeding edge intrinsics that are only available in Chrome Canary
+#if defined(__wasm_simd128__) && defined(__wasm_unimplemented_simd128__)
+#define SIMD_WASM
+#endif
+
 #ifdef SIMD_SSE
 #include <tmmintrin.h>
 #endif
 
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
 #ifdef SIMD_NEON
 #if defined(_MSC_VER) && defined(_M_ARM64)
 #include <arm64_neon.h>
@@ -34,6 +48,10 @@
 #endif
 #endif
 
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
 #ifndef TRACE
 #define TRACE 0
 #endif
@@ -42,6 +60,14 @@
 #include <stdio.h>
 #endif
 
+#ifdef SIMD_WASM
+#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
+#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
+#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
+#endif
+
 namespace meshopt
 {
 
@@ -267,7 +293,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 	return data;
 }
 
-#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
 static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 #define READ() byte = *data++
@@ -384,7 +410,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 }
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 static unsigned char kDecodeBytesGroupShuffle[256][8];
 static unsigned char kDecodeBytesGroupCount[256];
 
@@ -424,27 +450,6 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 	return _mm_unpacklo_epi64(sm0, sm1r);
 }
 
-static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
-{
-	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
-	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
-	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
-	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
-
-	x0 = _mm_unpacklo_epi16(t0, t2);
-	x1 = _mm_unpackhi_epi16(t0, t2);
-	x2 = _mm_unpacklo_epi16(t1, t3);
-	x3 = _mm_unpackhi_epi16(t1, t3);
-}
-
-static __m128i unzigzag8(__m128i v)
-{
-	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
-	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
-
-	return _mm_xor_si128(xl, xr);
-}
-
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 	switch (bitslog2)
@@ -511,9 +516,66 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	case 3:
 	{
-		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
-		__m128i result = rest;
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
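+// byte masks (3/15) for the 2-bit and 4-bit selectors, followed by per-byte bit offsets for _mm_multishift_epi64_epi8 that align each packed selector at bit 0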
+static const __m128i decodeBytesGroupConfig[] = {
+	_mm_set1_epi8(3),
+	_mm_set1_epi8(15),
+	_mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
+	_mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2);
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+
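+		// multishift copies, for each output byte, the 8 selector-stream bits starting at the offset in ctrl; masking with sent keeps just that byte's selector
+		// a selector equal to sent marks a byte stored verbatim in the rest stream; expand scatters rest bytes into exactly those lanes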
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return skip + _mm_popcnt_u32(mask16);
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
 		return data + 16;
@@ -561,28 +623,6 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m
 #endif
 }
 
-static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
-{
-	uint8x16x2_t t01 = vzipq_u8(x0, x1);
-	uint8x16x2_t t23 = vzipq_u8(x2, x3);
-
-	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
-	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
-
-	x0 = vreinterpretq_u8_u16(x01.val[0]);
-	x1 = vreinterpretq_u8_u16(x01.val[1]);
-	x2 = vreinterpretq_u8_u16(x23.val[0]);
-	x3 = vreinterpretq_u8_u16(x23.val[1]);
-}
-
-static uint8x16_t unzigzag8(uint8x16_t v)
-{
-	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
-	uint8x16_t xr = vshrq_n_u8(v, 1);
-
-	return veorq_u8(xl, xr);
-}
-
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 	switch (bitslog2)
@@ -639,9 +679,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	case 3:
 	{
-		uint8x16_t rest = vld1q_u8(data);
-
-		uint8x16_t result = rest;
+		uint8x16_t result = vld1q_u8(data);
 
 		vst1q_u8(buffer, result);
 
 		return data + 16;
@@ -655,7 +693,205 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 }
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#ifdef SIMD_WASM
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	// TODO: 8b buffer overrun - should we use splat or extend buffers?
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+	// TODO: we should use v8x16_load_splat
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
+	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
+
+	return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
+}
+
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{
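+	// emulates a byte-wise movemask: mbits tags byte i of each 64-bit half with bit i, and the OR-folds below collapse each half into an 8-bit mask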
+	uint64_t mbits = 0x8040201008040201ull;
+
+	uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits;
+	uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits;
+
+	uint32_t m0_4 = m0_8 | (m0_8 >> 32);
+	uint32_t m1_4 = m1_8 | (m1_8 >> 32);
+
+	uint16_t m0_2 = m0_4 | (m0_4 >> 16);
+	uint16_t m1_2 = m1_4 | (m1_4 >> 16);
+
+	mask0 = m0_2 | (m0_2 >> 8);
+	mask1 = m1_2 | (m1_2 >> 8);
+}
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	unsigned char byte, enc, encv;
+	const unsigned char* data_var;
+
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0);
+
+		wasm_v128_store(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		// TODO: test 4b load splat
+		v128_t sel2 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 4);
+
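+		// spread the packed 2-bit selectors so that byte i of sel holds the selector for output byte i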
+		v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
+
+		if (!wasm_i8x16_any_true(mask))
+		{
+			wasm_v128_store(buffer, sel);
+
+			return data + 4;
+		}
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		// TODO: test or/andnot
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		// TODO: test 8b load splat
+		v128_t sel4 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 8);
+
+		v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
+
+		if (!wasm_i8x16_any_true(mask))
+		{
+			wasm_v128_store(buffer, sel);
+
+			return data + 8;
+		}
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		// TODO: test or/andnot
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
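+// transpose8/unzigzag8 are defined once per ISA below; the __m128i versions are shared by the SSE and AVX-512 decoders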
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
+
+	x0 = _mm_unpacklo_epi16(t0, t2);
+	x1 = _mm_unpackhi_epi16(t0, t2);
+	x2 = _mm_unpacklo_epi16(t1, t3);
+	x3 = _mm_unpackhi_epi16(t1, t3);
+}
+
+static __m128i unzigzag8(__m128i v)
+{
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{
+	uint8x16x2_t t01 = vzipq_u8(x0, x1);
+	uint8x16x2_t t23 = vzipq_u8(x2, x3);
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]);
+	x1 = vreinterpretq_u8_u16(x01.val[1]);
+	x2 = vreinterpretq_u8_u16(x23.val[0]);
+	x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
+	uint8x16_t xr = vshrq_n_u8(v, 1);
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{
+	v128_t t0 = wasm_unpacklo_v8x16(x0, x1);
+	v128_t t1 = wasm_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasm_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasm_unpackhi_v8x16(x2, x3);
+
+	x0 = wasm_unpacklo_v16x8(t0, t2);
+	x1 = wasm_unpackhi_v16x8(t0, t2);
+	x2 = wasm_unpacklo_v16x8(t1, t3);
+	x3 = wasm_unpackhi_v16x8(t1, t3);
+}
+
+static v128_t unzigzag8(v128_t v)
+{
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
+	v128_t xr = wasm_u8x16_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
 {
 	assert(buffer_size % kByteGroupSize == 0);
@@ -719,7 +955,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 				return 0;
 		}
 
-#ifdef SIMD_SSE
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
 #define TEMP __m128i
 #define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
 #define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
@@ -735,6 +971,15 @@
 #define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 #define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
 #define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
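+// GRP4 broadcasts each 4-byte group of r##i into t0..t3, FIXD accumulates each group into the running vertex value pi, SAVE stores the low 32 bits of t##i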
+#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
 #endif
 
 		PREP();
@@ -906,14 +1151,21 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	int cpuinfo[4] = {};
 	__cpuid(cpuinfo, 1);
 	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
-#elif defined(SIMD_SSE) || defined(SIMD_NEON)
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	decode = decodeVertexBlockSimd;
 #else
 	decode = decodeVertexBlock;
 #endif
 
-#if defined(SIMD_SSE) || defined(SIMD_NEON)
+#if defined(SIMD_WASM)
+	// TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767
+	if (!gDecodeBytesGroupInitialized)
+		gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	assert(gDecodeBytesGroupInitialized);
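+	// keep the table-init flag referenced even when assert() compiles away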
+	(void)gDecodeBytesGroupInitialized;
 #endif
 
 	unsigned char* vertex_data = static_cast<unsigned char*>(destination);