vertexcodec.cpp

  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <string.h>
  5. // The block below auto-detects SIMD ISA that can be used on the target platform
  6. #ifndef MESHOPTIMIZER_NO_SIMD
  7. // The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
  8. #if defined(__AVX__) || defined(__SSSE3__)
  9. #define SIMD_SSE
  10. #endif
  11. // An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
  12. #if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
  13. #undef SIMD_SSE
  14. #define SIMD_AVX
  15. #endif
  16. // MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
  17. #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
  18. #define SIMD_SSE
  19. #define SIMD_FALLBACK
  20. #endif
  21. // GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
  22. #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
  23. #define SIMD_SSE
  24. #define SIMD_FALLBACK
  25. #define SIMD_TARGET __attribute__((target("ssse3")))
  26. #endif
  27. // GCC/clang define these when NEON support is available
  28. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  29. #define SIMD_NEON
  30. #endif
  31. // On MSVC, we assume that ARM builds always target NEON-capable devices
  32. #if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
  33. #define SIMD_NEON
  34. #endif
  35. // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
  36. #if defined(__wasm_simd128__)
  37. #define SIMD_WASM
  38. // Prevent compiling other variants when Wasm SIMD compilation is active
  39. #undef SIMD_NEON
  40. #undef SIMD_SSE
  41. #undef SIMD_AVX
  42. #endif
  43. #ifndef SIMD_TARGET
  44. #define SIMD_TARGET
  45. #endif
  46. // When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
  47. // We don't do this on 32-bit systems because it requires 64-bit math and would hurt in-order CPUs
  48. #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
  49. #define SIMD_LATENCYOPT
  50. #endif
  51. // In switch dispatch, marking the default case as unreachable allows the compiler to remove redundant bounds checks
  52. #if defined(__GNUC__)
  53. #define SIMD_UNREACHABLE() __builtin_unreachable()
  54. #elif defined(_MSC_VER)
  55. #define SIMD_UNREACHABLE() __assume(false)
  56. #else
  57. #define SIMD_UNREACHABLE() assert(!"Unreachable")
  58. #endif
  59. #endif // !MESHOPTIMIZER_NO_SIMD
  60. #ifdef SIMD_SSE
  61. #include <tmmintrin.h>
  62. #endif
  63. #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
  64. #ifdef _MSC_VER
  65. #include <intrin.h> // __cpuid
  66. #else
  67. #include <cpuid.h> // __cpuid
  68. #endif
  69. #endif
  70. #ifdef SIMD_AVX
  71. #include <immintrin.h>
  72. #endif
  73. #ifdef SIMD_NEON
  74. #if defined(_MSC_VER) && defined(_M_ARM64)
  75. #include <arm64_neon.h>
  76. #else
  77. #include <arm_neon.h>
  78. #endif
  79. #endif
  80. #ifdef SIMD_WASM
  81. #include <wasm_simd128.h>
  82. #endif
  83. #ifndef TRACE
  84. #define TRACE 0
  85. #endif
  86. #if TRACE
  87. #include <stdio.h>
  88. #endif
  89. #ifdef SIMD_WASM
  90. #define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
  91. #define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
  92. #define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
  93. #define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
  94. #define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
  95. #define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
  96. #define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
  97. #endif
  98. namespace meshopt
  99. {
  100. const unsigned char kVertexHeader = 0xa0;
  101. static int gEncodeVertexVersion = 1;
  102. const int kDecodeVertexVersion = 1;
  103. const size_t kVertexBlockSizeBytes = 8192;
  104. const size_t kVertexBlockMaxSize = 256;
  105. const size_t kByteGroupSize = 16;
  106. const size_t kByteGroupDecodeLimit = 24;
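  // note: 24 bytes covers the worst case for a single group: 8 packed bytes in 4-bit mode plus up to 16 appended literal bytes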
  107. const size_t kTailMinSizeV0 = 32;
  108. const size_t kTailMinSizeV1 = 24;
  109. static const int kBitsV0[4] = {0, 2, 4, 8};
  110. static const int kBitsV1[5] = {0, 1, 2, 4, 8};
  111. const int kEncodeDefaultLevel = 2;
  112. static size_t getVertexBlockSize(size_t vertex_size)
  113. {
  114. // make sure the entire block fits into the scratch buffer and is aligned to byte group size
  115. // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
  116. size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
  117. return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
  118. }
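  // e.g. a 36-byte vertex yields (8192 / 36) & ~15 = 224 vertices per block, while a 16-byte vertex is capped at kVertexBlockMaxSize (256)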
  119. inline unsigned int rotate(unsigned int v, int r)
  120. {
  121. return (v << r) | (v >> ((32 - r) & 31));
  122. }
  123. template <typename T>
  124. inline T zigzag(T v)
  125. {
  126. return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
  127. }
  128. template <typename T>
  129. inline T unzigzag(T v)
  130. {
  131. return (0 - (v & 1)) ^ (v >> 1);
  132. }
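  // e.g. for byte-sized deltas, zigzag maps +1/-1/+2/-2 to 2/1/4/3 so that small signed deltas become small unsigned values; unzigzag inverts the mapping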
  133. #if TRACE
  134. struct Stats
  135. {
  136. size_t size;
  137. size_t header; // bytes for header
  138. size_t bitg[9]; // bytes for bit groups
  139. size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
  140. size_t ctrl[4]; // number of control groups
  141. };
  142. static Stats* bytestats = NULL;
  143. static Stats vertexstats[256];
  144. #endif
  145. static bool encodeBytesGroupZero(const unsigned char* buffer)
  146. {
  147. assert(kByteGroupSize == sizeof(unsigned long long) * 2);
  148. unsigned long long v[2];
  149. memcpy(v, buffer, sizeof(v));
  150. return (v[0] | v[1]) == 0;
  151. }
  152. static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
  153. {
  154. assert(bits >= 0 && bits <= 8);
  155. if (bits == 0)
  156. return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
  157. if (bits == 8)
  158. return kByteGroupSize;
  159. size_t result = kByteGroupSize * bits / 8;
  160. unsigned char sentinel = (1 << bits) - 1;
  161. for (size_t i = 0; i < kByteGroupSize; ++i)
  162. result += buffer[i] >= sentinel;
  163. return result;
  164. }
  165. static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
  166. {
  167. assert(bits >= 0 && bits <= 8);
  168. assert(kByteGroupSize % 8 == 0);
  169. if (bits == 0)
  170. return data;
  171. if (bits == 8)
  172. {
  173. memcpy(data, buffer, kByteGroupSize);
  174. return data + kByteGroupSize;
  175. }
  176. size_t byte_size = 8 / bits;
  177. assert(kByteGroupSize % byte_size == 0);
  178. // fixed portion: bits bits for each value
  179. // variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
  180. unsigned char sentinel = (1 << bits) - 1;
  181. for (size_t i = 0; i < kByteGroupSize; i += byte_size)
  182. {
  183. unsigned char byte = 0;
  184. for (size_t k = 0; k < byte_size; ++k)
  185. {
  186. unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
  187. byte <<= bits;
  188. byte |= enc;
  189. }
  190. // encode 1-bit groups in reverse bit order
  191. // this makes them faster to decode alongside other groups
  192. if (bits == 1)
  193. byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
  194. *data++ = byte;
  195. }
  196. for (size_t i = 0; i < kByteGroupSize; ++i)
  197. {
  198. unsigned char v = buffer[i];
  199. // branchless append of out-of-range values
  200. *data = v;
  201. data += v >= sentinel;
  202. }
  203. return data;
  204. }
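  // e.g. with bits == 2 the 16 values are packed into 4 fixed bytes (4 values per byte); any value >= 3 is stored as the sentinel 3 and its full byte is appended to the variable portion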
  205. static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
  206. {
  207. assert(buffer_size % kByteGroupSize == 0);
  208. unsigned char* header = data;
  209. // each header byte holds 2-bit selectors for 4 groups, so round the group count up to a multiple of 4 to get the header size in bytes
  210. size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
  211. if (size_t(data_end - data) < header_size)
  212. return NULL;
  213. data += header_size;
  214. memset(header, 0, header_size);
  215. int last_bits = -1;
  216. for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
  217. {
  218. if (size_t(data_end - data) < kByteGroupDecodeLimit)
  219. return NULL;
  220. int best_bitk = 3;
  221. size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
  222. for (int bitk = 0; bitk < 3; ++bitk)
  223. {
  224. size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
  225. // favor consistent bit selection across groups, but never replace literals
  226. if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
  227. {
  228. best_bitk = bitk;
  229. best_size = size;
  230. }
  231. }
  232. size_t header_offset = i / kByteGroupSize;
  233. header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
  234. int best_bits = bits[best_bitk];
  235. unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
  236. assert(data + best_size == next);
  237. data = next;
  238. last_bits = best_bits;
  239. #if TRACE
  240. bytestats->bitg[best_bits] += best_size;
  241. #endif
  242. }
  243. #if TRACE
  244. bytestats->header += header_size;
  245. #endif
  246. return data;
  247. }
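  // note: the output for each byte plane is the selector header (2 bits per group, 4 groups per byte) followed by the encoded groups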
  248. template <typename T, bool Xor>
  249. static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
  250. {
  251. size_t k0 = k & ~(sizeof(T) - 1);
  252. int ks = (k & (sizeof(T) - 1)) * 8;
  253. T p = last_vertex[k0];
  254. for (size_t j = 1; j < sizeof(T); ++j)
  255. p |= T(last_vertex[k0 + j]) << (j * 8);
  256. const unsigned char* vertex = vertex_data + k0;
  257. for (size_t i = 0; i < vertex_count; ++i)
  258. {
  259. T v = vertex[0];
  260. for (size_t j = 1; j < sizeof(T); ++j)
  261. v |= vertex[j] << (j * 8);
  262. T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  263. buffer[i] = (unsigned char)(d >> ks);
  264. p = v;
  265. vertex += vertex_size;
  266. }
  267. }
  268. static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
  269. {
  270. switch (channel & 3)
  271. {
  272. case 0:
  273. return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  274. case 1:
  275. return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  276. case 2:
  277. return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
  278. default:
  279. assert(!"Unsupported channel encoding"); // unreachable
  280. }
  281. }
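  // channel 0: byte-wise zigzag deltas, channel 1: 16-bit zigzag deltas, channel 2: 32-bit XOR deltas with the rotation amount stored in the high bits of the channel byte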
  282. static int estimateBits(unsigned char v)
  283. {
  284. return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
  285. }
  286. static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
  287. {
  288. size_t sizes[8] = {};
  289. const unsigned char* vertex = vertex_data + k;
  290. unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  291. for (size_t i = 0; i < vertex_count; i += group_size)
  292. {
  293. unsigned int bitg = 0;
  294. // calculate bit consistency mask for the group
  295. for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
  296. {
  297. unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  298. unsigned int d = v ^ last;
  299. bitg |= d;
  300. last = v;
  301. vertex += vertex_size;
  302. }
  303. #if TRACE
  304. for (int j = 0; j < 32; ++j)
  305. vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
  306. #endif
  307. for (int j = 0; j < 8; ++j)
  308. {
  309. unsigned int bitr = rotate(bitg, j);
  310. sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
  311. sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
  312. }
  313. }
  314. int best_rot = 0;
  315. for (int rot = 1; rot < 8; ++rot)
  316. best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
  317. return best_rot;
  318. }
  319. static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
  320. {
  321. unsigned char block[kVertexBlockMaxSize];
  322. assert(vertex_block_size <= kVertexBlockMaxSize);
  323. unsigned char last_vertex[256] = {};
  324. size_t sizes[3] = {};
  325. assert(max_channel <= 3);
  326. for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
  327. {
  328. size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
  329. size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  330. memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
  331. // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  332. if (block_size < block_size_aligned)
  333. memset(block + block_size, 0, block_size_aligned - block_size);
  334. for (int channel = 0; channel < max_channel; ++channel)
  335. for (size_t j = 0; j < 4; ++j)
  336. {
  337. encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
  338. for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
  339. {
  340. // to maximize encoding performance we only evaluate 1/2/4/8 bit groups
  341. size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
  342. size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
  343. size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
  344. size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
  345. size_t best_size = size1 < size2 ? size1 : size2;
  346. best_size = best_size < size4 ? best_size : size4;
  347. best_size = best_size < size8 ? best_size : size8;
  348. sizes[channel] += best_size;
  349. }
  350. }
  351. }
  352. int best_channel = 0;
  353. for (int channel = 1; channel < max_channel; ++channel)
  354. best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
  355. return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
  356. }
  357. static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
  358. {
  359. for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  360. if (!encodeBytesGroupZero(buffer + i))
  361. return false;
  362. return true;
  363. }
  364. static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
  365. {
  366. if (estimateControlZero(buffer, vertex_count_aligned))
  367. return 2; // zero encoding
  368. if (level == 0)
  369. return 1; // 1248 encoding in level 0 for encoding speed
  370. // each header byte holds 2-bit selectors for 4 groups, so round the group count up to a multiple of 4 to get the header size in bytes
  371. size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
  372. size_t est_bytes0 = header_size, est_bytes1 = header_size;
  373. for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  374. {
  375. // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
  376. size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
  377. size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
  378. size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
  379. size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
  380. size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
  381. // both control modes have access to 1/2/4 bit encoding
  382. size_t size12 = size1 < size2 ? size1 : size2;
  383. size_t size124 = size12 < size4 ? size12 : size4;
  384. // each control mode has access to 0/8 bit encoding respectively
  385. est_bytes0 += size124 < size0 ? size124 : size0;
  386. est_bytes1 += size124 < size8 ? size124 : size8;
  387. }
  388. // pick the shortest control mode, but prefer literal encoding unless a bit-group estimate is strictly smaller than the literal size
  389. if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
  390. return est_bytes0 < est_bytes1 ? 0 : 1;
  391. else
  392. return 3; // literal encoding
  393. }
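  // resulting control values: 0 = bit groups with {0,1,2,4} bits, 1 = bit groups with {1,2,4,8} bits, 2 = all-zero, 3 = literal bytes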
  394. static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
  395. {
  396. assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  397. assert(vertex_size % 4 == 0);
  398. unsigned char buffer[kVertexBlockMaxSize];
  399. assert(sizeof(buffer) % kByteGroupSize == 0);
  400. size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  401. // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  402. memset(buffer, 0, sizeof(buffer));
  403. size_t control_size = version == 0 ? 0 : vertex_size / 4;
  404. if (size_t(data_end - data) < control_size)
  405. return NULL;
  406. unsigned char* control = data;
  407. data += control_size;
  408. memset(control, 0, control_size);
  409. for (size_t k = 0; k < vertex_size; ++k)
  410. {
  411. encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
  412. #if TRACE
  413. const unsigned char* olddata = data;
  414. bytestats = &vertexstats[k];
  415. #endif
  416. int ctrl = 0;
  417. if (version != 0)
  418. {
  419. ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
  420. assert(unsigned(ctrl) < 4);
  421. control[k / 4] |= ctrl << ((k % 4) * 2);
  422. #if TRACE
  423. vertexstats[k].ctrl[ctrl]++;
  424. #endif
  425. }
  426. if (ctrl == 3)
  427. {
  428. // literal encoding
  429. if (size_t(data_end - data) < vertex_count)
  430. return NULL;
  431. memcpy(data, buffer, vertex_count);
  432. data += vertex_count;
  433. }
  434. else if (ctrl != 2) // non-zero encoding
  435. {
  436. data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
  437. if (!data)
  438. return NULL;
  439. }
  440. #if TRACE
  441. bytestats = NULL;
  442. vertexstats[k].size += data - olddata;
  443. #endif
  444. }
  445. memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
  446. return data;
  447. }
  448. #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
  449. static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
  450. {
  451. #define READ() byte = *data++
  452. #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
  453. unsigned char byte, enc, encv;
  454. const unsigned char* data_var;
  455. switch (bits)
  456. {
  457. case 0:
  458. memset(buffer, 0, kByteGroupSize);
  459. return data;
  460. case 1:
  461. data_var = data + 2;
  462. // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
  463. READ();
  464. byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
  465. NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
  466. READ();
  467. byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
  468. NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
  469. return data_var;
  470. case 2:
  471. data_var = data + 4;
  472. // 4 groups with 4 2-bit values in each byte
  473. READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
  474. READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
  475. READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
  476. READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
  477. return data_var;
  478. case 4:
  479. data_var = data + 8;
  480. // 8 groups with 2 4-bit values in each byte
  481. READ(), NEXT(4), NEXT(4);
  482. READ(), NEXT(4), NEXT(4);
  483. READ(), NEXT(4), NEXT(4);
  484. READ(), NEXT(4), NEXT(4);
  485. READ(), NEXT(4), NEXT(4);
  486. READ(), NEXT(4), NEXT(4);
  487. READ(), NEXT(4), NEXT(4);
  488. READ(), NEXT(4), NEXT(4);
  489. return data_var;
  490. case 8:
  491. memcpy(buffer, data, kByteGroupSize);
  492. return data + kByteGroupSize;
  493. default:
  494. assert(!"Unexpected bit length"); // unreachable
  495. return data;
  496. }
  497. #undef READ
  498. #undef NEXT
  499. }
  500. static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
  501. {
  502. assert(buffer_size % kByteGroupSize == 0);
  503. // each header byte holds 2-bit selectors for 4 groups, so round the group count up to a multiple of 4 to get the header size in bytes
  504. size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
  505. if (size_t(data_end - data) < header_size)
  506. return NULL;
  507. const unsigned char* header = data;
  508. data += header_size;
  509. for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
  510. {
  511. if (size_t(data_end - data) < kByteGroupDecodeLimit)
  512. return NULL;
  513. size_t header_offset = i / kByteGroupSize;
  514. int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
  515. data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
  516. }
  517. return data;
  518. }
  519. template <typename T, bool Xor>
  520. static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
  521. {
  522. for (size_t k = 0; k < 4; k += sizeof(T))
  523. {
  524. size_t vertex_offset = k;
  525. T p = last_vertex[0];
  526. for (size_t j = 1; j < sizeof(T); ++j)
  527. p |= last_vertex[j] << (8 * j);
  528. for (size_t i = 0; i < vertex_count; ++i)
  529. {
  530. T v = buffer[i];
  531. for (size_t j = 1; j < sizeof(T); ++j)
  532. v |= buffer[i + vertex_count * j] << (8 * j);
  533. v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;
  534. for (size_t j = 0; j < sizeof(T); ++j)
  535. transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));
  536. p = v;
  537. vertex_offset += vertex_size;
  538. }
  539. buffer += vertex_count * sizeof(T);
  540. last_vertex += sizeof(T);
  541. }
  542. }
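  // note: buffer stores the decoded byte planes back to back (vertex_count bytes per plane); decodeDeltas1 reassembles T-sized values, undoes the zigzag/XOR transform and writes them back in interleaved vertex order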
  543. static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
  544. {
  545. assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  546. unsigned char buffer[kVertexBlockMaxSize * 4];
  547. unsigned char transposed[kVertexBlockSizeBytes];
  548. size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  549. assert(vertex_count <= vertex_count_aligned);
  550. size_t control_size = version == 0 ? 0 : vertex_size / 4;
  551. if (size_t(data_end - data) < control_size)
  552. return NULL;
  553. const unsigned char* control = data;
  554. data += control_size;
  555. for (size_t k = 0; k < vertex_size; k += 4)
  556. {
  557. unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
  558. for (size_t j = 0; j < 4; ++j)
  559. {
  560. int ctrl = (ctrl_byte >> (j * 2)) & 3;
  561. if (ctrl == 3)
  562. {
  563. // literal encoding
  564. if (size_t(data_end - data) < vertex_count)
  565. return NULL;
  566. memcpy(buffer + j * vertex_count, data, vertex_count);
  567. data += vertex_count;
  568. }
  569. else if (ctrl == 2)
  570. {
  571. // zero encoding
  572. memset(buffer + j * vertex_count, 0, vertex_count);
  573. }
  574. else
  575. {
  576. data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
  577. if (!data)
  578. return NULL;
  579. }
  580. }
  581. int channel = version == 0 ? 0 : channels[k / 4];
  582. switch (channel & 3)
  583. {
  584. case 0:
  585. decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
  586. break;
  587. case 1:
  588. decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
  589. break;
  590. case 2:
  591. decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
  592. break;
  593. default:
  594. return NULL; // invalid channel type
  595. }
  596. }
  597. memcpy(vertex_data, transposed, vertex_count * vertex_size);
  598. memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
  599. return data;
  600. }
  601. #endif
  602. #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
  603. static unsigned char kDecodeBytesGroupShuffle[256][8];
  604. static unsigned char kDecodeBytesGroupCount[256];
  605. #ifdef __wasm__
  606. __attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
  607. #endif
  608. static bool
  609. decodeBytesGroupBuildTables()
  610. {
  611. for (int mask = 0; mask < 256; ++mask)
  612. {
  613. unsigned char shuffle[8];
  614. unsigned char count = 0;
  615. for (int i = 0; i < 8; ++i)
  616. {
  617. int maski = (mask >> i) & 1;
  618. shuffle[i] = maski ? count : 0x80;
  619. count += (unsigned char)(maski);
  620. }
  621. memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
  622. kDecodeBytesGroupCount[mask] = count;
  623. }
  624. return true;
  625. }
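  // e.g. mask 0b00000101 produces shuffle {0, 0x80, 1, 0x80, ...} and count 2: set bits pick consecutive literal bytes, while 0x80 yields a zero lane in the byte shuffles below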
  626. static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
  627. #endif
  628. #ifdef SIMD_SSE
  629. SIMD_TARGET
  630. inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
  631. {
  632. __m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
  633. __m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
  634. __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
  635. __m128i sm1r = _mm_add_epi8(sm1, sm1off);
  636. return _mm_unpacklo_epi64(sm0, sm1r);
  637. }
  638. SIMD_TARGET
  639. inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
  640. {
  641. switch (hbits)
  642. {
  643. case 0:
  644. case 4:
  645. {
  646. __m128i result = _mm_setzero_si128();
  647. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  648. return data;
  649. }
  650. case 1:
  651. case 6:
  652. {
  653. #ifdef __GNUC__
  654. typedef int __attribute__((aligned(1))) unaligned_int;
  655. #else
  656. typedef int unaligned_int;
  657. #endif
  658. #ifdef SIMD_LATENCYOPT
  659. unsigned int data32;
  660. memcpy(&data32, data, 4);
  661. data32 &= data32 >> 1;
  662. // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
  663. unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
  664. // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  665. int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  666. #endif
  667. __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
  668. __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
  669. __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
  670. __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
  671. __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
  672. __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
  673. int mask16 = _mm_movemask_epi8(mask);
  674. unsigned char mask0 = (unsigned char)(mask16 & 255);
  675. unsigned char mask1 = (unsigned char)(mask16 >> 8);
  676. __m128i shuf = decodeShuffleMask(mask0, mask1);
  677. __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
  678. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  679. #ifdef SIMD_LATENCYOPT
  680. return data + 4 + datacnt;
  681. #else
  682. return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  683. #endif
  684. }
  685. case 2:
  686. case 7:
  687. {
  688. #ifdef SIMD_LATENCYOPT
  689. unsigned long long data64;
  690. memcpy(&data64, data, 8);
  691. data64 &= data64 >> 1;
  692. data64 &= data64 >> 2;
  693. // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  694. int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  695. #endif
  696. __m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
  697. __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
  698. __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
  699. __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
  700. __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
  701. int mask16 = _mm_movemask_epi8(mask);
  702. unsigned char mask0 = (unsigned char)(mask16 & 255);
  703. unsigned char mask1 = (unsigned char)(mask16 >> 8);
  704. __m128i shuf = decodeShuffleMask(mask0, mask1);
  705. __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
  706. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  707. #ifdef SIMD_LATENCYOPT
  708. return data + 8 + datacnt;
  709. #else
  710. return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  711. #endif
  712. }
  713. case 3:
  714. case 8:
  715. {
  716. __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
  717. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  718. return data + 16;
  719. }
  720. case 5:
  721. {
  722. __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));
  723. unsigned char mask0 = data[0];
  724. unsigned char mask1 = data[1];
  725. __m128i shuf = decodeShuffleMask(mask0, mask1);
  726. __m128i result = _mm_shuffle_epi8(rest, shuf);
  727. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  728. return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  729. }
  730. default:
  731. SIMD_UNREACHABLE(); // unreachable
  732. }
  733. }
  734. #endif
  735. #ifdef SIMD_AVX
  736. static const __m128i kDecodeBytesGroupConfig[8][2] = {
  737. {_mm_setzero_si128(), _mm_setzero_si128()},
  738. {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
  739. {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
  740. {_mm_setzero_si128(), _mm_setzero_si128()},
  741. {_mm_setzero_si128(), _mm_setzero_si128()},
  742. {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
  743. {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
  744. {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
  745. };
  746. SIMD_TARGET
  747. inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
  748. {
  749. switch (hbits)
  750. {
  751. case 0:
  752. case 4:
  753. {
  754. __m128i result = _mm_setzero_si128();
  755. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  756. return data;
  757. }
  758. case 5: // 1-bit
  759. case 1: // 2-bit
  760. case 6:
  761. case 2: // 4-bit
  762. case 7:
  763. {
  764. const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));
  765. __m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
  766. __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
  767. __m128i sent = kDecodeBytesGroupConfig[hbits][0];
  768. __m128i ctrl = kDecodeBytesGroupConfig[hbits][1];
  769. __m128i selw = _mm_shuffle_epi32(selb, 0x44);
  770. __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
  771. __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
  772. __m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
  773. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  774. return skip + _mm_popcnt_u32(mask16);
  775. }
  776. case 3:
  777. case 8:
  778. {
  779. __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
  780. _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  781. return data + 16;
  782. }
  783. default:
  784. SIMD_UNREACHABLE(); // unreachable
  785. }
  786. }
  787. #endif
  788. #ifdef SIMD_NEON
  789. SIMD_TARGET
  790. inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
  791. {
  792. uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
  793. uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
  794. uint8x8_t r0 = vtbl1_u8(rest0, sm0);
  795. uint8x8_t r1 = vtbl1_u8(rest1, sm1);
  796. return vcombine_u8(r0, r1);
  797. }
  798. SIMD_TARGET
  799. inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
  800. {
  801. // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
  802. const uint64_t magic = 0x000103070f1f3f80ull;
  803. uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
  804. mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
  805. mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
  806. }
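  // the 64-bit multiply collapses the eight 0x00/0xff byte lanes into an 8-bit mask in the top byte, playing the same role as _mm_movemask_epi8 in the SSE path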
  807. SIMD_TARGET
  808. inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
  809. {
  810. switch (hbits)
  811. {
  812. case 0:
  813. case 4:
  814. {
  815. uint8x16_t result = vdupq_n_u8(0);
  816. vst1q_u8(buffer, result);
  817. return data;
  818. }
  819. case 1:
  820. case 6:
  821. {
  822. #ifdef SIMD_LATENCYOPT
  823. unsigned int data32;
  824. memcpy(&data32, data, 4);
  825. data32 &= data32 >> 1;
  826. // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
  827. unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
  828. // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  829. int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  830. #endif
  831. uint8x8_t sel2 = vld1_u8(data);
  832. uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
  833. uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
  834. uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
  835. uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
  836. unsigned char mask0, mask1;
  837. neonMoveMask(mask, mask0, mask1);
  838. uint8x8_t rest0 = vld1_u8(data + 4);
  839. uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
  840. uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
  841. vst1q_u8(buffer, result);
  842. #ifdef SIMD_LATENCYOPT
  843. return data + 4 + datacnt;
  844. #else
  845. return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  846. #endif
  847. }
  848. case 2:
  849. case 7:
  850. {
  851. #ifdef SIMD_LATENCYOPT
  852. unsigned long long data64;
  853. memcpy(&data64, data, 8);
  854. data64 &= data64 >> 1;
  855. data64 &= data64 >> 2;
  856. // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  857. int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  858. #endif
  859. uint8x8_t sel4 = vld1_u8(data);
  860. uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
  861. uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
  862. uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
  863. unsigned char mask0, mask1;
  864. neonMoveMask(mask, mask0, mask1);
  865. uint8x8_t rest0 = vld1_u8(data + 8);
  866. uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
  867. uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
  868. vst1q_u8(buffer, result);
  869. #ifdef SIMD_LATENCYOPT
  870. return data + 8 + datacnt;
  871. #else
  872. return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  873. #endif
  874. }
  875. case 3:
  876. case 8:
  877. {
  878. uint8x16_t result = vld1q_u8(data);
  879. vst1q_u8(buffer, result);
  880. return data + 16;
  881. }
  882. case 5:
  883. {
  884. unsigned char mask0 = data[0];
  885. unsigned char mask1 = data[1];
  886. uint8x8_t rest0 = vld1_u8(data + 2);
  887. uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);
  888. uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);
  889. vst1q_u8(buffer, result);
  890. return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  891. }
  892. default:
  893. SIMD_UNREACHABLE(); // unreachable
  894. }
  895. }
  896. #endif
  897. #ifdef SIMD_WASM
  898. SIMD_TARGET
  899. inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
  900. {
  901. v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
  902. v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
  903. v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
  904. v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
  905. return wasmx_unpacklo_v64x2(sm0, sm1r);
  906. }
  907. SIMD_TARGET
  908. inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
  909. {
  910. // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
  911. const uint64_t magic = 0x000103070f1f3f80ull;
  912. mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
  913. mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
  914. }
  915. SIMD_TARGET
  916. inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
  917. {
  918. switch (hbits)
  919. {
  920. case 0:
  921. case 4:
  922. {
  923. v128_t result = wasm_i8x16_splat(0);
  924. wasm_v128_store(buffer, result);
  925. return data;
  926. }
  927. case 1:
  928. case 6:
  929. {
  930. v128_t sel2 = wasm_v128_load(data);
  931. v128_t rest = wasm_v128_load(data + 4);
  932. v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
  933. v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
  934. v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
  935. v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
  936. unsigned char mask0, mask1;
  937. wasmMoveMask(mask, mask0, mask1);
  938. v128_t shuf = decodeShuffleMask(mask0, mask1);
  939. v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
  940. wasm_v128_store(buffer, result);
  941. return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  942. }
  943. case 2:
  944. case 7:
  945. {
  946. v128_t sel4 = wasm_v128_load(data);
  947. v128_t rest = wasm_v128_load(data + 8);
  948. v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
  949. v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
  950. v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
  951. unsigned char mask0, mask1;
  952. wasmMoveMask(mask, mask0, mask1);
  953. v128_t shuf = decodeShuffleMask(mask0, mask1);
  954. v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
  955. wasm_v128_store(buffer, result);
  956. return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  957. }
  958. case 3:
  959. case 8:
  960. {
  961. v128_t result = wasm_v128_load(data);
  962. wasm_v128_store(buffer, result);
  963. return data + 16;
  964. }
  965. case 5:
  966. {
  967. v128_t rest = wasm_v128_load(data + 2);
  968. unsigned char mask0 = data[0];
  969. unsigned char mask1 = data[1];
  970. v128_t shuf = decodeShuffleMask(mask0, mask1);
  971. v128_t result = wasm_i8x16_swizzle(rest, shuf);
  972. wasm_v128_store(buffer, result);
  973. return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  974. }
  975. default:
  976. SIMD_UNREACHABLE(); // unreachable
  977. }
  978. }
  979. #endif
  980. #if defined(SIMD_SSE) || defined(SIMD_AVX)
  981. SIMD_TARGET
  982. inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
  983. {
  984. __m128i t0 = _mm_unpacklo_epi8(x0, x1);
  985. __m128i t1 = _mm_unpackhi_epi8(x0, x1);
  986. __m128i t2 = _mm_unpacklo_epi8(x2, x3);
  987. __m128i t3 = _mm_unpackhi_epi8(x2, x3);
  988. x0 = _mm_unpacklo_epi16(t0, t2);
  989. x1 = _mm_unpackhi_epi16(t0, t2);
  990. x2 = _mm_unpacklo_epi16(t1, t3);
  991. x3 = _mm_unpackhi_epi16(t1, t3);
  992. }
  993. SIMD_TARGET
  994. inline __m128i unzigzag8(__m128i v)
  995. {
  996. __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
  997. __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
  998. return _mm_xor_si128(xl, xr);
  999. }
  1000. SIMD_TARGET
  1001. inline __m128i unzigzag16(__m128i v)
  1002. {
  1003. __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
  1004. __m128i xr = _mm_srli_epi16(v, 1);
  1005. return _mm_xor_si128(xl, xr);
  1006. }
  1007. SIMD_TARGET
  1008. inline __m128i rotate32(__m128i v, int r)
  1009. {
  1010. return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
  1011. }
  1012. #endif
  1013. #ifdef SIMD_NEON
  1014. SIMD_TARGET
  1015. inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
  1016. {
  1017. uint8x16x2_t t01 = vzipq_u8(x0, x1);
  1018. uint8x16x2_t t23 = vzipq_u8(x2, x3);
  1019. uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
  1020. uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
  1021. x0 = vreinterpretq_u8_u16(x01.val[0]);
  1022. x1 = vreinterpretq_u8_u16(x01.val[1]);
  1023. x2 = vreinterpretq_u8_u16(x23.val[0]);
  1024. x3 = vreinterpretq_u8_u16(x23.val[1]);
  1025. }
  1026. SIMD_TARGET
  1027. inline uint8x16_t unzigzag8(uint8x16_t v)
  1028. {
  1029. uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
  1030. uint8x16_t xr = vshrq_n_u8(v, 1);
  1031. return veorq_u8(xl, xr);
  1032. }
  1033. SIMD_TARGET
  1034. inline uint8x16_t unzigzag16(uint8x16_t v)
  1035. {
  1036. uint16x8_t vv = vreinterpretq_u16_u8(v);
  1037. uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
  1038. uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));
  1039. return veorq_u8(xl, xr);
  1040. }
  1041. SIMD_TARGET
  1042. inline uint8x16_t rotate32(uint8x16_t v, int r)
  1043. {
  1044. uint32x4_t v32 = vreinterpretq_u32_u8(v);
  1045. return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
  1046. }
  1047. template <int Channel>
  1048. SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
  1049. {
  1050. switch (Channel)
  1051. {
  1052. case 0:
  1053. {
  1054. uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
  1055. uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
  1056. return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
  1057. }
  1058. case 1:
  1059. {
  1060. uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
  1061. uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
  1062. return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
  1063. }
  1064. case 2:
  1065. {
  1066. uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
  1067. uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
  1068. return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
  1069. }
  1070. default:
  1071. return npi;
  1072. }
  1073. }
  1074. #endif
  1075. #ifdef SIMD_WASM
  1076. SIMD_TARGET
  1077. inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
  1078. {
  1079. v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
  1080. v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
  1081. v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
  1082. v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
  1083. x0 = wasmx_unpacklo_v16x8(t0, t2);
  1084. x1 = wasmx_unpackhi_v16x8(t0, t2);
  1085. x2 = wasmx_unpacklo_v16x8(t1, t3);
  1086. x3 = wasmx_unpackhi_v16x8(t1, t3);
  1087. }
  1088. SIMD_TARGET
  1089. inline v128_t unzigzag8(v128_t v)
  1090. {
  1091. v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
  1092. v128_t xr = wasm_u8x16_shr(v, 1);
  1093. return wasm_v128_xor(xl, xr);
  1094. }
  1095. SIMD_TARGET
  1096. inline v128_t unzigzag16(v128_t v)
  1097. {
  1098. v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
  1099. v128_t xr = wasm_u16x8_shr(v, 1);
  1100. return wasm_v128_xor(xl, xr);
  1101. }
  1102. SIMD_TARGET
  1103. inline v128_t rotate32(v128_t v, int r)
  1104. {
  1105. return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
  1106. }
  1107. #endif
  1108. #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
  1109. SIMD_TARGET
  1110. static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
  1111. {
  1112. assert(buffer_size % kByteGroupSize == 0);
  1113. assert(kByteGroupSize == 16);
  1114. // each header byte holds 2-bit selectors for 4 groups, so round the group count up to a multiple of 4 to get the header size in bytes
  1115. size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
  1116. if (size_t(data_end - data) < header_size)
  1117. return NULL;
  1118. const unsigned char* header = data;
  1119. data += header_size;
  1120. size_t i = 0;
  1121. // fast-path: process 4 groups at a time, do a shared bounds check
  1122. for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
  1123. {
  1124. size_t header_offset = i / kByteGroupSize;
  1125. unsigned char header_byte = header[header_offset / 4];
  1126. data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
  1127. data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
  1128. data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
  1129. data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
  1130. }
  1131. // slow-path: process remaining groups
  1132. for (; i < buffer_size; i += kByteGroupSize)
  1133. {
  1134. if (size_t(data_end - data) < kByteGroupDecodeLimit)
  1135. return NULL;
  1136. size_t header_offset = i / kByteGroupSize;
  1137. unsigned char header_byte = header[header_offset / 4];
  1138. data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
  1139. }
  1140. return data;
  1141. }
template <int Channel>
SIMD_TARGET static void
decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
{
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
#endif

#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))

	PREP();

	unsigned char* savep = transposed;

	for (size_t j = 0; j < vertex_count_aligned; j += 16)
	{
		LOAD(0);
		LOAD(1);
		LOAD(2);
		LOAD(3);

		transpose8(r0, r1, r2, r3);

		TEMP t0, t1, t2, t3;
		TEMP npi = pi;

		UNZR(0);
		GRP4(0);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(1);
		GRP4(1);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(2);
		GRP4(2);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(3);
		GRP4(3);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
		pi = rebase<Channel>(npi, r0, r1, r2, r3);
#else
		(void)npi;
#endif

#undef UNZR
#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
	}
}
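
// Note: decodeVertexBlockSimd below is the SIMD counterpart of the scalar block decoder. For
// v1 streams it first reads one control byte per 4-byte channel; each 2-bit control value
// selects how one byte stream of that channel was stored (0/1: group-encoded via
// decodeBytesSimd with the corresponding header mapping, 2: all zeroes, 3: raw literals).
// The decoded byte streams are then merged by decodeDeltas4Simd according to the channel
// kind (byte deltas, 16-bit deltas, or xor with an optional bit rotation) and written back
// as interleaved vertices.
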
SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	const unsigned char* control = data;
	data += control_size;

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];

		for (size_t j = 0; j < 4; ++j)
		{
			int ctrl = (ctrl_byte >> (j * 2)) & 3;

			if (ctrl == 3)
			{
				// literal encoding; safe to over-copy due to tail
				if (size_t(data_end - data) < vertex_count_aligned)
					return NULL;

				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
				data += vertex_count;
			}
			else if (ctrl == 2)
			{
				// zero encoding
				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
			}
			else
			{
				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
				int hshift = version == 0 ? 0 : 4 + ctrl;

				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
				if (!data)
					return NULL;
			}
		}

		int channel = version == 0 ? 0 : channels[k / 4];

		switch (channel & 3)
		{
		case 0:
			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 1:
			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 2:
			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
			break;
		default:
			return NULL; // invalid channel type
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif
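
// Note: when both SIMD_SSE and SIMD_FALLBACK are defined, block decoding picks between the
// SIMD and scalar paths at runtime (see meshopt_decodeVertexBuffer). getCpuFeatures below
// returns ECX of CPUID leaf 1; bit 9 of that register is the SSSE3 feature flag, which the
// SSE decode path requires.
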
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

static unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt
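
// Usage sketch for the public entry points below (illustrative only; Vertex, vertices and
// vertex_count are placeholders, and vertex_size must be a multiple of 4, at most 256):
//
//   std::vector<unsigned char> buf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
//   size_t size = meshopt_encodeVertexBuffer(buf.data(), buf.size(), vertices, vertex_count, sizeof(Vertex));
//   buf.resize(size); // a result of 0 would mean the destination buffer was too small
//
//   std::vector<Vertex> decoded(vertex_count);
//   int res = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, sizeof(Vertex), buf.data(), buf.size());
//   // res == 0 on success; negative values indicate malformed or truncated input
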
size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);

	version = version < 0 ? gEncodeVertexVersion : version;

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return 0;

	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	unsigned char channels[64] = {};

	if (version != 0 && level > 1 && vertex_count > 1)
		for (size_t k = 0; k < vertex_size; k += 4)
		{
			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);

			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
			channels[k / 4] = (unsigned char)channel;
		}

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return 0;

	if (tail_size < tail_size_pad)
	{
		memset(data, 0, tail_size_pad - tail_size);
		data += tail_size_pad - tail_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	if (version != 0)
	{
		memcpy(data, channels, vertex_size / 4);
		data += vertex_size / 4;
	}

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
		double total_kr = total_k ? 1.0 / double(total_k) : 0;

		if (version != 0)
		{
			int channel = channels[k / 4];

			if ((channel & 3) == 2 && k % 4 == 0)
				printf(" | ^%d", channel >> 4);
			else
				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
		}

		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
		    double(vsk.header) * total_kr * 100,
		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);

		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];

		if (total_ctrl)
		{
			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
		}

		if (level >= 3)
			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);

		printf("\n");
	}
#endif

	return data - buffer;
}
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion);
}
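
// Note: meshopt_encodeVertexBufferBound below computes a conservative upper bound on the
// encoded size: one byte for the header, a per-block worst-case estimate covering control
// bytes, group headers and uncompressed group data for the vertex byte streams, plus the
// zero-padded tail. Actual encoded output is at most this size and usually much smaller.
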
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_control_size = vertex_size / 4;
	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size + (vertex_size / 4);
	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	assert(tail_size_pad >= kByteGroupDecodeLimit);

	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
}
void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));

	meshopt::gEncodeVertexVersion = version;
}
int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
{
	if (buffer_size < 1)
		return -1;

	unsigned char header = buffer[0];

	if ((header & 0xf0) != meshopt::kVertexHeader)
		return -1;

	int version = header & 0x0f;
	if (version > meshopt::kDecodeVertexVersion)
		return -1;

	return version;
}
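
// Note on the return values used below: -1 indicates a malformed or unsupported header,
// -2 a truncated buffer or a block that fails to decode, -3 unexpected extra data after
// the last block, and 0 success.
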
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return -2;

	unsigned char data_header = *data++;

	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > kDecodeVertexVersion)
		return -1;

	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return -2;

	const unsigned char* tail = data_end - tail_size;

	unsigned char last_vertex[256];
	memcpy(last_vertex, tail, vertex_size);

	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	if (size_t(data_end - data) != tail_size_pad)
		return -3;

	return 0;
}
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
#undef SIMD_LATENCYOPT