vertexcodec.cpp

// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif

// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET __attribute__((target("ssse3")))
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
#endif

#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

static int gEncodeVertexVersion = 0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;

const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
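
// Example (illustrative): for vertex_size == 36, 8192 / 36 = 227 vertices fit in the
// scratch buffer; truncating to a multiple of kByteGroupSize gives 224, below the
// 256-vertex cap, so blocks hold 224 vertices. For small vertices the cap of 256 wins.
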
inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}
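
// Example (illustrative): zigzag8 maps small signed deltas to small unsigned codes:
// 0 -> 0, -1 -> 1, +1 -> 2, -2 -> 3, +2 -> 4, ...
// so byte streams that change slowly between vertices compress into few bits per group.
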
static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}
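
// Example (illustrative): with bits == 2, a 16-byte group packs into 4 bytes of 2-bit
// fields (values 0..2 stored directly, 3 used as an escape sentinel), followed by one
// full byte per escaped value; the group {0,1,2,0, 0,0,5,0, 1,1,0,0, 0,0,0,9} thus
// takes 4 + 2 = 6 bytes, matching encodeBytesGroupMeasure(buffer, 2).
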
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;
	}

	return data;
}
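
// Layout recap (illustrative): encodeBytes emits a header with one 2-bit selector per
// 16-byte group (four selectors per header byte; selector k means 2^k bits per value),
// followed by the variable-size group payloads in order. decodeBytes and
// decodeBytesSimd below read this same layout back.
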
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
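
// Example (illustrative): with vertex_size == 12, each block is encoded as 12
// independent byte streams; stream k holds byte k of every vertex in the block,
// delta-encoded against the previous vertex and zigzag-mapped before encodeBytes.
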
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;
			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];
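
// Example (illustrative): for mask 0b00000101 (escape bytes in lanes 0 and 2),
// kDecodeBytesGroupShuffle[mask] = {0, 0x80, 1, 0x80, 0x80, ...} and
// kDecodeBytesGroupCount[mask] = 2: escaped lanes pull consecutive bytes from the
// variable-length tail, while 0x80 entries zero the lane in the byte shuffle.
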
#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
SIMD_TARGET
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_AVX
static const __m128i decodeBytesGroupConfig[] = {
    _mm_set1_epi8(3),
    _mm_set1_epi8(15),
    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};

	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
	uint8x16_t masked = vandq_u8(mask, byte_mask);

#ifdef __aarch64__
	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
	mask0 = vaddv_u8(vget_low_u8(masked));
	mask1 = vaddv_u8(vget_high_u8(masked));
#else
	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
	uint8x8_t sum3 = vpadd_u8(sum2, sum2);

	mask0 = vget_lane_u8(sum3, 0);
	mask1 = vget_lane_u8(sum3, 1);
#endif
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

SIMD_TARGET
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);

	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;

	// TODO: This can use v8x16_bitmask in the future
	uint64_t mask_2 = mask_1a | mask_1b;
	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
	uint64_t mask_8 = mask_4 | (mask_4 >> 8);

	mask0 = uint8_t(mask_8);
	mask1 = uint8_t(mask_8 >> 32);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

SIMD_TARGET
static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

SIMD_TARGET
static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt

size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	int version = gEncodeVertexVersion;

	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

	return data - buffer;
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}
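
// Example (illustrative): for vertex_count = 1000 and vertex_size = 16, blocks hold 256
// vertices, so vertex_block_count = 4, the per-stream header is (256 / 16 + 3) / 4 = 4
// bytes, and the bound is 1 + 4 * 16 * (4 + 256) + 32 = 16673 bytes for 16000 bytes of input.
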
void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= 0);

	meshopt::gEncodeVertexVersion = version;
}

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	unsigned char data_header = *data++;

	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > 0)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}

#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
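
// Illustrative usage sketch (not part of this file): the buffer names below are
// placeholders; only the meshopt_* entry points defined above are assumed.
//
//   std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
//   encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size));
//
//   // decoding into a preallocated destination buffer; returns 0 on success
//   int rc = meshopt_decodeVertexBuffer(destination, vertex_count, vertex_size, encoded.data(), encoded.size());
//   assert(rc == 0);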