vertexcodec.cpp

// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#include <intrin.h> // __cpuid
#endif

#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifndef TRACE
#define TRACE 0
#endif

#if TRACE
#include <stdio.h>
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
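
// Example: vertex_size == 100 gives 8192 / 100 == 81, truncated to 80 (a
// multiple of kByteGroupSize); vertex_size == 4 gives 2048, clamped to
// kVertexBlockMaxSize == 256.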

inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}
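
// Worked mapping (mod 256): 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...;
// zigzag folds small deltas of either sign into small unsigned values, which
// feeds the bit-size selection in encodeBytes below.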

#if TRACE
struct Stats
{
	size_t size;
	size_t header;
	size_t bitg[4];
	size_t bitb[4];
};

Stats* bytestats;
Stats vertexstats[256];
#endif

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}
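
// Worked example for bits == 2 (sentinel == 3): the group {0, 1, 2, 5}
// followed by twelve zeros packs into four control bytes 0x1b 0x00 0x00 0x00
// (two bits per value, 5 clamped to the sentinel) plus one exception byte
// 0x05 - 5 bytes total instead of 16, matching
// encodeBytesGroupMeasure(buffer, 2) == 4 + 1.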

static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;

#if TRACE > 1
		bytestats->bitg[bitslog2]++;
		bytestats->bitb[bitslog2] += best_size;
#endif
	}

#if TRACE > 1
	bytestats->header += header_size;
#endif

	return data;
}
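
// Header packing example: four consecutive groups encoded with 1, 2, 4 and 8
// bits (bitslog2 of 0, 1, 2, 3) share one header byte,
// 0 | (1 << 2) | (2 << 4) | (3 << 6) == 0xe4.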

static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;

#if TRACE
		bytestats = 0;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
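
// Layout note: byte k of every vertex forms one stream, so a block of N
// vertices is emitted as vertex_size consecutive streams of N zigzag deltas;
// e.g. for 8-byte vertices A and B the first stream is {A[0] - prev[0],
// B[0] - A[0]} and the last is {A[7] - prev[7], B[7] - A[7]}.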

#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}
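
// This inverts the encodeBytesGroup example above: for bitslog2 == 1, the
// five bytes 0x1b 0x00 0x00 0x00 0x05 decode back to {0, 1, 2, 5} followed
// by twelve zeros; the sentinel value 3 in the control stream redirects the
// fourth value to the exception byte at data_var.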

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;
			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif
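
// The prefix-sum pass above is the exact inverse of the encoder's delta pass;
// e.g. with p == 10, decoded zigzag bytes {2, 2, 1} unzigzag to deltas
// {1, 1, -1} and reconstruct the byte stream {11, 12, 11}.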

#if defined(SIMD_SSE) || defined(SIMD_NEON)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

static bool decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}
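
// Example table entry: mask 0b00000101 (exception bytes needed in lanes 0 and
// 2) yields shuffle {0, 0x80, 1, 0x80, 0x80, ...} and count 2: lane 0 takes
// exception byte 0, lane 2 takes exception byte 1, and 0x80 zeroes the
// remaining lanes in both pshufb and vtbl lookups.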

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		__m128i result = rest;

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};

	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
	uint8x16_t masked = vandq_u8(mask, byte_mask);

#ifdef __aarch64__
	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
	mask0 = vaddv_u8(vget_low_u8(masked));
	mask1 = vaddv_u8(vget_high_u8(masked));
#else
	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
	uint8x8_t sum3 = vpadd_u8(sum2, sum2);

	mask0 = vget_lane_u8(sum3, 0);
	mask1 = vget_lane_u8(sum3, 1);
#endif
}
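
// neonMoveMask mirrors SSE _mm_movemask_epi8 for the two 8-lane halves: each
// 0xff lane is masked to its bit weight (1 << lane) and the weights are summed
// horizontally, so e.g. lanes 0 and 2 of the low half set gives mask0 == 5.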

static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));

		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		uint8x16_t rest = vld1q_u8(data);

		uint8x16_t result = rest;

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON)
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=32b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kTailMaxSize * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

#ifdef SIMD_SSE
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

} // namespace meshopt

size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	*data++ = kVertexHeader;

	unsigned char last_vertex[256] = {};

	if (vertex_count > 0)
		memcpy(last_vertex, vertex_data, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, vertex_data, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

#if TRACE > 1
		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
		    int(vsk.header),
		    int(vsk.bitg[0]), int(vsk.bitb[0]),
		    int(vsk.bitg[1]), int(vsk.bitb[1]),
		    int(vsk.bitg[2]), int(vsk.bitb[2]),
		    int(vsk.bitg[3]), int(vsk.bitb[3]));
#endif

		printf("\n");
	}
#endif

	return data - buffer;
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}
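
// Example: vertex_count == 1000, vertex_size == 16 gives a 256-vertex block
// size (8192 / 16 == 512, clamped), 4 blocks, a 4-byte group header per byte
// stream ((256 / 16 + 3) / 4 == 4), and a bound of
// 1 + 4 * 16 * (4 + 256) + 32 == 16673 bytes for 16000 bytes of input.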

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	int cpuinfo[4] = {};
	__cpuid(cpuinfo, 1);

	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON)
	assert(gDecodeBytesGroupInitialized);
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	if (*data++ != kVertexHeader)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}
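
// Usage sketch (illustrative, not part of the library): round-tripping a
// vertex buffer through the codec. Vertex is a hypothetical 16-byte POD; any
// layout with vertex_size % 4 == 0 and vertex_size <= 256 works. Error
// handling is elided for brevity.
//
//   #include "meshoptimizer.h"
//   #include <vector>
//
//   struct Vertex { float x, y, z; unsigned char nx, ny, nz, pad; };
//
//   std::vector<unsigned char> encode(const std::vector<Vertex>& vertices)
//   {
//       std::vector<unsigned char> result(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
//       result.resize(meshopt_encodeVertexBuffer(result.data(), result.size(), vertices.data(), vertices.size(), sizeof(Vertex)));
//       return result; // encoder returns 0 if the buffer was too small
//   }
//
//   std::vector<Vertex> decode(const std::vector<unsigned char>& data, size_t count)
//   {
//       std::vector<Vertex> result(count);
//       int rc = meshopt_decodeVertexBuffer(result.data(), count, sizeof(Vertex), data.data(), data.size());
//       assert(rc == 0); // negative results indicate malformed input
//       return result;
//   }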