integerCoding.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. // TODO(syoyo) Report fatal error
  2. //
  3. // Copyright 2017 Pixar
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "Apache License")
  6. // with the following modification; you may not use this file except in
  7. // compliance with the Apache License and the following modification to it:
  8. // Section 6. Trademarks. is deleted and replaced with:
  9. //
  10. // 6. Trademarks. This License does not grant permission to use the trade
  11. // names, trademarks, service marks, or product names of the Licensor
  12. // and its affiliates, except as required to comply with Section 4(c) of
  13. // the License and to reproduce the content of the NOTICE file.
  14. //
  15. // You may obtain a copy of the Apache License at
  16. //
  17. // http://www.apache.org/licenses/LICENSE-2.0
  18. //
  19. // Unless required by applicable law or agreed to in writing, software
  20. // distributed under the Apache License with the above modification is
  21. // distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  22. // KIND, either express or implied. See the Apache License for the specific
  23. // language governing permissions and limitations under the Apache License.
  24. //
  25. //#include "pxr/pxr.h"
  26. //#include "pxr/base/tf/diagnostic.h"
  27. //#include "pxr/base/tf/fastCompression.h"
  28. //#include "pxr/usd/usd/integerCoding.h"
  29. #include "lz4-compression.hh"
  30. #include "integerCoding.h"
  31. #include <cstdint>
  32. #include <cstring>
  33. #include <limits>
  34. #include <memory>
  35. #include <unordered_map>
  36. //PXR_NAMESPACE_OPEN_SCOPE
  37. namespace tinyusdz {
  38. /*
  39. These integer coding & compression routines are tailored for what are typically
  40. lists of indexes into other tables. The binary "usdc" file format has lots of
  41. these in its "structural sections" that define the object hierarchy.
  42. The basic idea is to take a contiguous list of 32-bit integers and encode them
  43. in a buffer that is not only smaller, but also still quite compressible by a
  44. general compression algorithm, and then compress that buffer to produce a final
  45. result. Decompression proceeds by going in reverse. The general compressor is
  46. LZ4 via TfFastCompression. The integer coding scheme implemented here is
  47. described below.
  48. We encode a list of integers as follows. First we transform the input to
  49. produce a new list of integers where each element is the difference between it
  50. and the previous integer in the input sequence. This is the sequence we encode.
  51. Next we find the most common value in the sequence and write it to the output.
  52. Then we write 2-bit codes, one for each integer, classifying it. Finally we
  53. write a variable length section of integer data. The decoder uses the 2-bit
  54. codes to understand how to interpret this variable length data.
  55. Given a list of integers, say:
  56. input = [123, 124, 125, 100125, 100125, 100126, 100126]
  57. We encode as follows. First, we transform the list to be the list of
  58. differences to the previous integer, or the integer itself for the first element
  59. in the list (this can be considered a difference to 0) to get:
  60. input_diffs = [123, 1, 1, 100000, 0, 1, 0]
  61. Then we find the most commonly occurring value in this sequence, which is '1'.
  62. We write this most commonly occurring value into the output stream.
  63. output = [int32(1)]
  64. Next we write two sections, first a fixed length section, 2-bit codes per
  65. integer, followed by a variable length section of integer data. The two bit
  66. code indicates what "kind" of integer we have:
  67. 00: The most common value
  68. 01: 8-bit integer
  69. 10: 16-bit integer
  70. 11: 32-bit integer
  71. For our example this gives:
  72. input = [123, 124, 125, 100125, 100125, 100126, 10026]
  73. output = [int32(1) 01 00 00 11 01 00 01 XX int8(123) int32(100000) int8(0) int8(0)]
  74. Where 'XX' represents unused bits in the last byte of the codes section to round
  75. up to an even number of bytes.
  76. In this case the output size is 12 bytes compared to the original input which
  77. was 28 bytes. In the best possible case the output is (asymptotically) 2 bits
  78. per integer (6.25% the original size), in the worst possible case it is
  79. (asymptotically) 34 bits per integer (106.25% the original size).
  80. */
  81. namespace {
  82. template <class Int>
  83. inline typename std::enable_if<
  84. std::is_integral<Int>::value &&
  85. std::is_unsigned<Int>::value &&
  86. sizeof(Int) == 4,
  87. int32_t>::type
  88. _Signed(Int x)
  89. {
  90. if (x <= static_cast<uint32_t>(INT32_MAX))
  91. return static_cast<int32_t>(x);
  92. if (x >= static_cast<uint32_t>(INT32_MIN))
  93. return static_cast<int32_t>(x - INT32_MIN) + INT32_MIN;
  94. //TF_FATAL_ERROR("Unsupported C++ integer representation");
  95. return 0;
  96. }
  97. template <class Int>
  98. inline typename std::enable_if<
  99. std::is_integral<Int>::value &&
  100. std::is_signed<Int>::value &&
  101. sizeof(Int) == 4,
  102. int32_t>::type
  103. _Signed(Int x)
  104. {
  105. return x;
  106. }
  107. template <class Int>
  108. inline typename std::enable_if<
  109. std::is_integral<Int>::value &&
  110. std::is_unsigned<Int>::value &&
  111. sizeof(Int) == 8,
  112. int64_t>::type
  113. _Signed(Int x)
  114. {
  115. if (x <= static_cast<uint64_t>(INT64_MAX))
  116. return static_cast<int64_t>(x);
  117. if (x >= static_cast<uint64_t>(INT64_MIN))
  118. return static_cast<int64_t>(x - INT64_MIN) + INT64_MIN;
  119. //TF_FATAL_ERROR("Unsupported C++ integer representation");
  120. return 0;
  121. }
  122. template <class Int>
  123. inline typename std::enable_if<
  124. std::is_integral<Int>::value &&
  125. std::is_signed<Int>::value &&
  126. sizeof(Int) == 8,
  127. int64_t>::type
  128. _Signed(Int x)
  129. {
  130. return x;
  131. }
  132. template <class T>
  133. inline char *_WriteBits(char *p, T val)
  134. {
  135. memcpy(p, &val, sizeof(val));
  136. return p + sizeof(val);
  137. }
  138. template <class T>
  139. inline T _ReadBits(char const *&p)
  140. {
  141. T ret;
  142. memcpy(&ret, p, sizeof(ret));
  143. p += sizeof(ret);
  144. return ret;
  145. }
  146. template <class Int>
  147. constexpr size_t
  148. _GetEncodedBufferSize(size_t numInts)
  149. {
  150. // Calculate encoded integer size.
  151. return numInts ?
  152. /* commonValue */ (sizeof(Int)) +
  153. /* numCodesBytes */ ((numInts * 2 + 7) / 8) +
  154. /* maxIntBytes */ (numInts * sizeof(Int))
  155. : 0;
  156. }
  157. template <class Int>
  158. struct _SmallTypes
  159. {
  160. typedef typename std::conditional<
  161. sizeof(Int) == 4, int8_t, int16_t>::type SmallInt;
  162. typedef typename std::conditional<
  163. sizeof(Int) == 4, int16_t, int32_t>::type MediumInt;
  164. };
  165. template <int N, class Iterator>
  166. void _EncodeNHelper(
  167. Iterator &cur,
  168. typename std::iterator_traits<Iterator>::value_type commonValue,
  169. typename std::make_signed<
  170. typename std::iterator_traits<Iterator>::value_type
  171. >::type &prevVal,
  172. char *&codesOut,
  173. char *&vintsOut) {
  174. using Int = typename std::iterator_traits<Iterator>::value_type;
  175. using SInt = typename std::make_signed<Int>::type;
  176. using SmallInt = typename _SmallTypes<Int>::SmallInt;
  177. using MediumInt = typename _SmallTypes<Int>::MediumInt;
  178. static_assert(1 <= N && N <= 4, "");
  179. enum Code { Common, Small, Medium, Large };
  180. auto getCode = [commonValue](SInt x) {
  181. std::numeric_limits<SmallInt> smallLimit;
  182. std::numeric_limits<MediumInt> mediumLimit;
  183. if (x == _Signed(commonValue)) { return Common; }
  184. if (x >= smallLimit.min() && x <= smallLimit.max()) { return Small; }
  185. if (x >= mediumLimit.min() && x <= mediumLimit.max()) { return Medium; }
  186. return Large;
  187. };
  188. uint8_t codeByte = 0;
  189. for (int i = 0; i != N; ++i) {
  190. SInt val = _Signed(*cur) - prevVal;
  191. prevVal = _Signed(*cur++);
  192. Code code = getCode(val);
  193. codeByte |= (code << (2 * i));
  194. switch (code) {
  195. default:
  196. case Common:
  197. break;
  198. case Small:
  199. vintsOut = _WriteBits(vintsOut, static_cast<SmallInt>(val));
  200. break;
  201. case Medium:
  202. vintsOut = _WriteBits(vintsOut, static_cast<MediumInt>(val));
  203. break;
  204. case Large:
  205. vintsOut = _WriteBits(vintsOut, _Signed(val));
  206. break;
  207. };
  208. }
  209. codesOut = _WriteBits(codesOut, codeByte);
  210. }
  211. template <int N, class Iterator>
  212. void _DecodeNHelper(
  213. char const *&codesIn,
  214. char const *&vintsIn,
  215. typename std::iterator_traits<Iterator>::value_type commonValue,
  216. typename std::make_signed<
  217. typename std::iterator_traits<Iterator>::value_type
  218. >::type &prevVal,
  219. Iterator &output)
  220. {
  221. using Int = typename std::iterator_traits<Iterator>::value_type;
  222. using SInt = typename std::make_signed<Int>::type;
  223. using SmallInt = typename _SmallTypes<Int>::SmallInt;
  224. using MediumInt = typename _SmallTypes<Int>::MediumInt;
  225. enum Code { Common, Small, Medium, Large };
  226. uint8_t codeByte = *codesIn++;
  227. for (int i = 0; i != N; ++i) {
  228. switch ((codeByte & (3 << (2 * i))) >> (2 * i)) {
  229. default:
  230. case Common:
  231. prevVal += commonValue;
  232. break;
  233. case Small:
  234. prevVal += _ReadBits<SmallInt>(vintsIn);
  235. break;
  236. case Medium:
  237. prevVal += _ReadBits<MediumInt>(vintsIn);
  238. break;
  239. case Large:
  240. prevVal += _ReadBits<SInt>(vintsIn);
  241. break;
  242. }
  243. *output++ = static_cast<Int>(prevVal);
  244. }
  245. }
  246. template <class Int>
  247. size_t
  248. _EncodeIntegers(Int const *begin, size_t numInts, char *output)
  249. {
  250. using SInt = typename std::make_signed<Int>::type;
  251. if (numInts == 0)
  252. return 0;
  253. // First find the most common element value.
  254. SInt commonValue = 0;
  255. {
  256. size_t commonCount = 0;
  257. std::unordered_map<SInt, size_t> counts;
  258. SInt prevVal = 0;
  259. for (Int const *cur = begin, *end = begin + numInts;
  260. cur != end; ++cur) {
  261. SInt val = _Signed(*cur) - prevVal;
  262. const size_t count = ++counts[val];
  263. if (count > commonCount) {
  264. commonValue = val;
  265. commonCount = count;
  266. } else if (count == commonCount && val > commonValue) {
  267. // Take the largest common value in case of a tie -- this gives
  268. // the biggest potential savings in the encoded stream.
  269. commonValue = val;
  270. }
  271. prevVal = _Signed(*cur);
  272. }
  273. }
  274. // Now code the values.
  275. // Write most common value.
  276. char *p = _WriteBits(output, commonValue);
  277. char *codesOut = p;
  278. char *vintsOut = p + (numInts * 2 + 7) / 8;
  279. Int const *cur = begin;
  280. SInt prevVal = 0;
  281. while (numInts >= 4) {
  282. _EncodeNHelper<4>(cur, commonValue, prevVal, codesOut, vintsOut);
  283. numInts -= 4;
  284. }
  285. switch (numInts) {
  286. case 0: default: break;
  287. case 1: _EncodeNHelper<1>(cur, commonValue, prevVal, codesOut, vintsOut);
  288. break;
  289. case 2: _EncodeNHelper<2>(cur, commonValue, prevVal, codesOut, vintsOut);
  290. break;
  291. case 3: _EncodeNHelper<3>(cur, commonValue, prevVal, codesOut, vintsOut);
  292. break;
  293. };
  294. return vintsOut - output;
  295. }
  296. template <class Int>
  297. size_t _DecodeIntegers(char const *data, size_t numInts, Int *result)
  298. {
  299. using SInt = typename std::make_signed<Int>::type;
  300. auto commonValue = _ReadBits<SInt>(data);
  301. size_t numCodesBytes = (numInts * 2 + 7) / 8;
  302. char const *codesIn = data;
  303. char const *vintsIn = data + numCodesBytes;
  304. SInt prevVal = 0;
  305. auto intsLeft = numInts;
  306. while (intsLeft >= 4) {
  307. _DecodeNHelper<4>(codesIn, vintsIn, commonValue, prevVal, result);
  308. intsLeft -= 4;
  309. }
  310. switch (intsLeft) {
  311. case 0: default: break;
  312. case 1: _DecodeNHelper<1>(codesIn, vintsIn, commonValue, prevVal, result);
  313. break;
  314. case 2: _DecodeNHelper<2>(codesIn, vintsIn, commonValue, prevVal, result);
  315. break;
  316. case 3: _DecodeNHelper<3>(codesIn, vintsIn, commonValue, prevVal, result);
  317. break;
  318. };
  319. return numInts;
  320. }
  321. template <class Int>
  322. size_t
  323. _CompressIntegers(Int const *begin, size_t numInts, char *output, std::string *err)
  324. {
  325. // Working space.
  326. std::unique_ptr<char[]>
  327. encodeBuffer(new char[_GetEncodedBufferSize<Int>(numInts)]);
  328. // Encode first.
  329. size_t encodedSize = _EncodeIntegers(begin, numInts, encodeBuffer.get());
  330. // Then compress.
  331. return LZ4Compression::CompressToBuffer(
  332. encodeBuffer.get(), output, encodedSize, err);
  333. }
  334. template <class Int>
  335. size_t _DecompressIntegers(char const *compressed, size_t compressedSize,
  336. Int *ints, size_t numInts, std::string *err, char *workingSpace)
  337. {
  338. // Working space.
  339. size_t workingSpaceSize =
  340. Usd_IntegerCompression::GetDecompressionWorkingSpaceSize(numInts);
  341. std::unique_ptr<char[]> tmpSpace;
  342. if (!workingSpace) {
  343. tmpSpace.reset(new char[workingSpaceSize]);
  344. workingSpace = tmpSpace.get();
  345. }
  346. size_t decompSz = LZ4Compression::DecompressFromBuffer(
  347. compressed, workingSpace, compressedSize, workingSpaceSize, err);
  348. if (decompSz == 0)
  349. return 0;
  350. return _DecodeIntegers(workingSpace, numInts, ints);
  351. }
  352. } // anon
  353. ////////////////////////////////////////////////////////////////////////
  354. // 32 bit.
  355. size_t
  356. Usd_IntegerCompression::GetCompressedBufferSize(size_t numInts)
  357. {
  358. return LZ4Compression::GetCompressedBufferSize(
  359. _GetEncodedBufferSize<int32_t>(numInts));
  360. }
  361. size_t
  362. Usd_IntegerCompression::GetDecompressionWorkingSpaceSize(size_t numInts)
  363. {
  364. return _GetEncodedBufferSize<int32_t>(numInts);
  365. }
  366. size_t
  367. Usd_IntegerCompression::CompressToBuffer(
  368. int32_t const *ints, size_t numInts, char *compressed, std::string *err)
  369. {
  370. return _CompressIntegers(ints, numInts, compressed, err);
  371. }
  372. size_t
  373. Usd_IntegerCompression::CompressToBuffer(
  374. uint32_t const *ints, size_t numInts, char *compressed, std::string *err)
  375. {
  376. return _CompressIntegers(ints, numInts, compressed, err);
  377. }
  378. size_t
  379. Usd_IntegerCompression::DecompressFromBuffer(
  380. char const *compressed, size_t compressedSize,
  381. int32_t *ints, size_t numInts, std::string *err, char *workingSpace)
  382. {
  383. return _DecompressIntegers(compressed, compressedSize,
  384. ints, numInts, err, workingSpace);
  385. }
  386. size_t
  387. Usd_IntegerCompression::DecompressFromBuffer(
  388. char const *compressed, size_t compressedSize,
  389. uint32_t *ints, size_t numInts, std::string *err, char *workingSpace)
  390. {
  391. return _DecompressIntegers(compressed, compressedSize,
  392. ints, numInts, err, workingSpace);
  393. }
  394. ////////////////////////////////////////////////////////////////////////
  395. // 64 bit.
  396. size_t
  397. Usd_IntegerCompression64::GetCompressedBufferSize(size_t numInts)
  398. {
  399. return LZ4Compression::GetCompressedBufferSize(
  400. _GetEncodedBufferSize<int64_t>(numInts));
  401. }
  402. size_t
  403. Usd_IntegerCompression64::GetDecompressionWorkingSpaceSize(size_t numInts)
  404. {
  405. return _GetEncodedBufferSize<int64_t>(numInts);
  406. }
  407. size_t
  408. Usd_IntegerCompression64::CompressToBuffer(
  409. int64_t const *ints, size_t numInts, char *compressed, std::string *err)
  410. {
  411. return _CompressIntegers(ints, numInts, compressed, err);
  412. }
  413. size_t
  414. Usd_IntegerCompression64::CompressToBuffer(
  415. uint64_t const *ints, size_t numInts, char *compressed, std::string *err)
  416. {
  417. return _CompressIntegers(ints, numInts, compressed, err);
  418. }
  419. size_t
  420. Usd_IntegerCompression64::DecompressFromBuffer(
  421. char const *compressed, size_t compressedSize,
  422. int64_t *ints, size_t numInts, std::string *err, char *workingSpace)
  423. {
  424. return _DecompressIntegers(compressed, compressedSize,
  425. ints, numInts, err, workingSpace);
  426. }
  427. size_t
  428. Usd_IntegerCompression64::DecompressFromBuffer(
  429. char const *compressed, size_t compressedSize,
  430. uint64_t *ints, size_t numInts, std::string *err, char *workingSpace)
  431. {
  432. return _DecompressIntegers(compressed, compressedSize,
  433. ints, numInts, err, workingSpace);
  434. }
  435. //PXR_NAMESPACE_CLOSE_SCOPE
  436. } // namespace tinyusdz