vertexcodec.cpp
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#include <intrin.h> // __cpuid
#endif

#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// WebAssembly SIMD implementation requires a few bleeding edge intrinsics that are only available in Chrome Canary
#if defined(__wasm_simd128__) && defined(__wasm_unimplemented_simd128__)
#define SIMD_WASM
#endif

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifndef TRACE
#define TRACE 0
#endif

#if TRACE
#include <stdio.h>
#endif

#ifdef SIMD_WASM
#define wasmx_shuffle_v32x4(v, i, j, k, l) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * j, 4 * j + 1, 4 * j + 2, 4 * j + 3, 4 * k, 4 * k + 1, 4 * k + 2, 4 * k + 3, 4 * l, 4 * l + 1, 4 * l + 2, 4 * l + 3)
#define wasmx_splat_v32x4(v, i) wasmx_shuffle_v32x4(v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
#define wasmx_unpacklo_v64x2(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
#define wasmx_unpackhi_v64x2(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}

inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}

#if TRACE
struct Stats
{
	size_t size;
	size_t header;
	size_t bitg[4];
	size_t bitb[4];
};

Stats* bytestats;
Stats vertexstats[256];
#endif

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
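	// for example, with bits == 2 the sentinel is 3: values 1, 0, 3, 7 pack into the fixed byte 0b01001111,
	// and 3 and 7 are then stored verbatim as full bytes in the variable portion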
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}

static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
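	// each group stores its bit width as a 2-bit code (bitslog2), so four groups share one header byte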
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
		assert(data + best_size == next);
		data = next;

#if TRACE > 1
		bytestats->bitg[bitslog2]++;
		bytestats->bitb[bitslog2] += best_size;
#endif
	}

#if TRACE > 1
	bytestats->header += header_size;
#endif

	return data;
}

static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;

#if TRACE
		bytestats = 0;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}

#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;

			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
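// for each 8-bit sentinel mask, kDecodeBytesGroupShuffle[mask] holds the byte shuffle that scatters the next
// popcount(mask) full bytes into the lanes flagged by the mask (0x80 elsewhere), and kDecodeBytesGroupCount[mask]
// holds that popcount; both tables are filled by decodeBytesGroupBuildTables below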
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

#ifdef EMSCRIPTEN
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif
#ifdef SIMD_AVX
static const __m128i decodeBytesGroupConfig[] = {
	_mm_set1_epi8(3),
	_mm_set1_epi8(15),
	_mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
	_mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif
#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};

	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
	uint8x16_t masked = vandq_u8(mask, byte_mask);

#ifdef __aarch64__
	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
	mask0 = vaddv_u8(vget_low_u8(masked));
	mask1 = vaddv_u8(vget_high_u8(masked));
#else
	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
	uint8x8_t sum3 = vpadd_u8(sum2, sum2);

	mask0 = vget_lane_u8(sum3, 0);
	mask1 = vget_lane_u8(sum3, 1);
#endif
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif
#ifdef SIMD_WASM
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	// TODO: 8b buffer overrun - should we use splat or extend buffers?
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	// TODO: we should use v8x16_load_splat
	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	v128_t mask_0 = wasmx_shuffle_v32x4(mask, 0, 2, 1, 3);

	// TODO: when Chrome supports v128.const we can try doing vectorized and?
	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;

	uint64_t mask_2 = mask_1a | mask_1b;
	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
	uint64_t mask_8 = mask_4 | (mask_4 >> 8);

	mask0 = uint8_t(mask_8);
	mask1 = uint8_t(mask_8 >> 32);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	{
		// TODO: test 4b load splat
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		// TODO: test or/andnot
		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		// TODO: test 8b load splat
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		// TODO: test or/andnot
		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX)
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=32b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kTailMaxSize * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

} // namespace meshopt
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	*data++ = kVertexHeader;

	unsigned char last_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(last_vertex, vertex_data, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, vertex_data, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

#if TRACE > 1
		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
		       int(vsk.header),
		       int(vsk.bitg[0]), int(vsk.bitb[0]),
		       int(vsk.bitg[1]), int(vsk.bitb[1]),
		       int(vsk.bitg[2]), int(vsk.bitb[2]),
		       int(vsk.bitg[3]), int(vsk.bitb[3]));
#endif

		printf("\n");
	}
#endif

	return data - buffer;
}
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	int cpuinfo[4] = {};
	__cpuid(cpuinfo, 1);

	// check for SSSE3 support (bit 9 of ECX); fall back to the scalar decoder otherwise
	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	if (*data++ != kVertexHeader)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}
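
// A minimal usage sketch for the public entry points above (illustrative only; the Vertex struct,
// the input vector and the roundtrip helper are assumptions made for the example, not part of the library):
//
//   #include "meshoptimizer.h"
//   #include <assert.h>
//   #include <vector>
//
//   struct Vertex { float px, py, pz; }; // 12 bytes, a multiple of 4 as the codec asserts require
//
//   void roundtrip(const std::vector<Vertex>& vertices)
//   {
//       size_t count = vertices.size();
//
//       // worst-case allocation, then shrink to the actual encoded size
//       std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(count, sizeof(Vertex)));
//       size_t size = meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices.data(), count, sizeof(Vertex));
//       encoded.resize(size); // 0 would mean the destination buffer was too small
//
//       std::vector<Vertex> decoded(count);
//       int res = meshopt_decodeVertexBuffer(decoded.data(), count, sizeof(Vertex), encoded.data(), encoded.size());
//       assert(res == 0); // negative values indicate malformed or truncated input
//   }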