// vertexcodec.cpp

// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif

// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET __attribute__((target("ssse3")))
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
// Prevent compiling other variant when wasm simd compilation is active
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#endif

#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif

// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
#define SIMD_LATENCYOPT
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

static int gEncodeVertexVersion = 0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
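// For example, a 40-byte vertex yields 8192 / 40 = 204, truncated to 192 (a multiple of 16),
// while a 16-byte vertex yields 512, which is clamped to the 256-vertex block maximum.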
inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}
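// The zigzag mapping keeps small signed deltas small as unsigned bytes:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ... which lets encodeBytes below pack them into few bits.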
static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}
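// Worked example for bits == 2: the 16 values are packed four per byte (4 bytes of fixed data);
// any value >= 3 is written as the sentinel 3 and its full byte is appended after the packed
// portion, so a group with two such escapes costs 4 + 2 = 6 bytes instead of 16 raw bytes.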
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return NULL;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
		assert(data + best_size == next);

		data = next;
	}

	return data;
}
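// The resulting per-stream layout is a small header (2-bit bitslog2 per 16-byte group, four groups
// per header byte) followed by the variable-size group payloads; e.g. a 256-value stream has 16
// groups and therefore a 4-byte header.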
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return NULL;
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
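// Note that each byte position of the vertex is compressed as an independent stream: byte k of
// consecutive vertices is delta-encoded against the previous vertex (seeded from last_vertex),
// zigzag-mapped, and then bit-packed by encodeBytes.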
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return NULL;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return NULL;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;

			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];
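// kDecodeBytesGroupShuffle[mask] maps an 8-bit escape mask to a byte shuffle that gathers the
// escaped bytes in order (0x80 entries produce zero lanes), and kDecodeBytesGroupCount[mask] is
// the popcount of the mask; e.g. mask 0b00000101 yields shuffle {0, 0x80, 1, 0x80, ...} and count 2.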
#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
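// SSSE3 decoding of one 16-byte group: unpack the packed 2-/4-bit selectors, compare them against
// the sentinel to get an escape mask, then use the precomputed shuffle tables (via pshufb) to pull
// the variable-length escape bytes into their lanes and blend them with the in-range selectors.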
SIMD_TARGET
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_AVX
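// The AVX-512 (VBMI/VBMI2) variant below replaces the table-driven shuffle: VPMULTISHIFTQB pulls
// each packed selector out of the control bytes at an arbitrary bit offset, and a masked VPEXPANDB
// expands the escape bytes directly into the lanes whose selector equals the sentinel.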
static const __m128i decodeBytesGroupConfig[] = {
    _mm_set1_epi8(3),
    _mm_set1_epi8(15),
    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
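	// multiplying the eight 0x00/0xff mask bytes by it gathers one bit per byte into the top byte
	// of the 64-bit product, emulating the SSE movemask instruction that NEON lacks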
	const uint64_t magic = 0x000103070f1f3f80ull;

	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);

	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

SIMD_TARGET
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

SIMD_TARGET
static __m128i unzigzag8(__m128i v)
{
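	// SSE has no 8-bit shifts, so v >> 1 is emulated with a 16-bit shift plus a mask that clears
	// the bit that crossed each byte boundary; the low-bit sign term is built with a byte subtract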
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

SIMD_TARGET
static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return NULL;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return NULL;
		}

#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif
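		// The macros above decode four byte streams at a time: LOAD fetches 16 decoded deltas per
		// stream, unzigzag8/transpose8 turn them back into 16 consecutive 4-byte vertex chunks, and
		// GRP4/FIXD/SAVE apply the running per-byte prefix sum and store each chunk into the
		// interleaved output at vertex_size stride.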
		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
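// getCpuFeatures returns ECX of CPUID leaf 1; bit 9 of that register (checked in
// meshopt_decodeVertexBuffer below) indicates SSSE3 support.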
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

static unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt

size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	int version = gEncodeVertexVersion;

	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

	return data - buffer;
}
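// The encoded stream is thus: a 1-byte header (0xa0 | version), the bit-packed vertex blocks, and a
// tail of max(vertex_size, 32) bytes whose last vertex_size bytes hold the first vertex verbatim;
// the decoder reads the tail to seed its prediction, and the padding keeps its bounds checks simple.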
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}
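// Worked example: vertex_size = 16, vertex_count = 1000 gives a block size of 256, 4 blocks, a
// 4-byte worst-case header per byte stream, and a bound of 1 + 4 * 16 * (4 + 256) + 32 = 16673
// bytes for 16000 bytes of raw input (the bound assumes every group falls back to raw bytes).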
void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= 0);

	meshopt::gEncodeVertexVersion = version;
}

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	unsigned char data_header = *data++;

	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > 0)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}
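// A minimal caller-side usage sketch (not part of this file); error handling is omitted and Vertex
// stands for the application's own vertex struct, whose size must equal vertex_size (a multiple of 4):
//
//   std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
//   encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size));
//
//   std::vector<Vertex> decoded(vertex_count);
//   int rc = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, vertex_size, encoded.data(), encoded.size());
//   assert(rc == 0);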
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET