zstdseek_decompress.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. /*
  2. * Copyright (c) 2017-present, Facebook, Inc.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. /* *********************************************************
  11. * Turn on Large Files support (>4GB) for 32-bit Linux/Unix
  12. ***********************************************************/
  13. #if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */
  14. # if !defined(_FILE_OFFSET_BITS)
  15. # define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */
  16. # endif
  17. # if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */
  18. # define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */
  19. # endif
  20. # if defined(_AIX) || defined(__hpux)
  21. # define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */
  22. # endif
  23. #endif
  24. /* ************************************************************
  25. * Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
  26. ***************************************************************/
  27. #if defined(_MSC_VER) && _MSC_VER >= 1400
  28. # define LONG_SEEK _fseeki64
  29. #elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
  30. # define LONG_SEEK fseeko
  31. #elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__)
  32. # define LONG_SEEK fseeko64
  33. #elif defined(_WIN32) && !defined(__DJGPP__)
  34. # include <windows.h>
/* Fallback 64-bit seek for Windows toolchains without _fseeki64/fseeko64:
 * seeks through the underlying Win32 HANDLE so offsets beyond 2 GiB work.
 * Returns 0 on success, -1 on failure (same contract as fseek).
 * NOTE(review): this bypasses stdio's buffer without flushing it; callers in
 * this file pair every seek with fresh reads, but confirm before reusing. */
static int LONG_SEEK(FILE* file, __int64 offset, int origin) {
    LARGE_INTEGER off;
    DWORD method;
    off.QuadPart = offset;
    /* translate the stdio origin into the Win32 move method */
    if (origin == SEEK_END)
        method = FILE_END;
    else if (origin == SEEK_CUR)
        method = FILE_CURRENT;
    else
        method = FILE_BEGIN;
    /* _get_osfhandle maps the CRT descriptor back to the OS handle */
    if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method))
        return 0;
    else
        return -1;
}
  50. #else
  51. # define LONG_SEEK fseek
  52. #endif
  53. #include <stdlib.h> /* malloc, free */
  54. #include <stdio.h> /* FILE* */
#include <limits.h> /* UINT_MAX */
  56. #include <assert.h>
  57. #define XXH_STATIC_LINKING_ONLY
  58. #include "xxhash.h"
  59. #define ZSTD_STATIC_LINKING_ONLY
  60. #include "zstd.h"
  61. #include "zstd_errors.h"
  62. #include "mem.h"
  63. #include "zstd_seekable.h"
  64. #undef ERROR
  65. #define ERROR(name) ((size_t)-ZSTD_error_##name)
  66. #define CHECK_IO(f) { int const errcod = (f); if (errcod < 0) return ERROR(seekableIO); }
  67. #undef MIN
  68. #undef MAX
  69. #define MIN(a, b) ((a) < (b) ? (a) : (b))
  70. #define MAX(a, b) ((a) > (b) ? (a) : (b))
  71. #define ZSTD_SEEKABLE_NO_OUTPUT_PROGRESS_MAX 16
  72. /* Special-case callbacks for FILE* and in-memory modes, so that we can treat
  73. * them the same way as the advanced API */
  74. static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n)
  75. {
  76. size_t const result = fread(buffer, 1, n, (FILE*)opaque);
  77. if (result != n) {
  78. return -1;
  79. }
  80. return 0;
  81. }
  82. static int ZSTD_seekable_seek_FILE(void* opaque, long long offset, int origin)
  83. {
  84. int const ret = LONG_SEEK((FILE*)opaque, offset, origin);
  85. if (ret) return ret;
  86. return fflush((FILE*)opaque);
  87. }
/* Cursor over a caller-provided in-memory compressed buffer; used as the
 * `opaque` state for the in-memory read/seek callbacks below. */
typedef struct {
    const void *ptr;  /* start of the buffer (not owned, never freed here) */
    size_t size;      /* total buffer size in bytes */
    size_t pos;       /* current read position, kept <= size */
} buffWrapper_t;
  93. static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n)
  94. {
  95. buffWrapper_t* const buff = (buffWrapper_t*)opaque;
  96. assert(buff != NULL);
  97. if (buff->pos + n > buff->size) return -1;
  98. memcpy(buffer, (const BYTE*)buff->ptr + buff->pos, n);
  99. buff->pos += n;
  100. return 0;
  101. }
  102. static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
  103. {
  104. buffWrapper_t* const buff = (buffWrapper_t*) opaque;
  105. unsigned long long newOffset;
  106. assert(buff != NULL);
  107. switch (origin) {
  108. case SEEK_SET:
  109. assert(offset >= 0);
  110. newOffset = (unsigned long long)offset;
  111. break;
  112. case SEEK_CUR:
  113. newOffset = (unsigned long long)((long long)buff->pos + offset);
  114. break;
  115. case SEEK_END:
  116. newOffset = (unsigned long long)((long long)buff->size + offset);
  117. break;
  118. default:
  119. assert(0); /* not possible */
  120. }
  121. if (newOffset > buff->size) {
  122. return -1;
  123. }
  124. buff->pos = newOffset;
  125. return 0;
  126. }
/* One seek-table entry: cumulative start offsets of a frame. */
typedef struct {
    U64 cOffset;   /* compressed offset of the frame within the source */
    U64 dOffset;   /* decompressed offset of the frame's first byte */
    U32 checksum;  /* low 32 bits of XXH64 of the frame's decompressed data;
                    * only meaningful when checksumFlag is set */
} seekEntry_t;

/* Parsed seek table. `entries` holds tableLen+1 elements: one per frame plus
 * a sentinel whose offsets equal the stream totals, so frame sizes can be
 * computed as entries[i+1] - entries[i] without special-casing the last one. */
struct ZSTD_seekTable_s {
    seekEntry_t* entries;  /* tableLen+1 slots, owned by this struct */
    size_t tableLen;       /* number of frames */
    int checksumFlag;      /* non-zero if per-frame checksums are present */
};
/* Size of the internal input/output staging buffers. */
#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_MAX

/* Full decompression context for a seekable archive. */
struct ZSTD_seekable_s {
    ZSTD_DStream* dstream;        /* streaming decoder, reset at each new frame */
    ZSTD_seekTable seekTable;     /* frame index, loaded during init */
    ZSTD_seekable_customFile src; /* user read/seek callbacks + opaque handle */
    U64 decompressedOffset;       /* current position in the decompressed stream */
    U32 curFrame;                 /* index of the frame currently decoded */
    BYTE inBuff[SEEKABLE_BUFF_SIZE]; /* need to do our own input buffering */
    BYTE outBuff[SEEKABLE_BUFF_SIZE]; /* so we can efficiently decompress the
                                         starts of chunks before we get to the
                                         desired section */
    ZSTD_inBuffer in; /* maintain continuity across ZSTD_seekable_decompress operations */
    buffWrapper_t buffWrapper; /* for `src.opaque` in in-memory mode */
    XXH64_state_t xxhState; /* running checksum of the current frame's output */
};
  152. ZSTD_seekable* ZSTD_seekable_create(void)
  153. {
  154. ZSTD_seekable* const zs = (ZSTD_seekable*)malloc(sizeof(ZSTD_seekable));
  155. if (zs == NULL) return NULL;
  156. /* also initializes stage to zsds_init */
  157. memset(zs, 0, sizeof(*zs));
  158. zs->dstream = ZSTD_createDStream();
  159. if (zs->dstream == NULL) {
  160. free(zs);
  161. return NULL;
  162. }
  163. return zs;
  164. }
  165. size_t ZSTD_seekable_free(ZSTD_seekable* zs)
  166. {
  167. if (zs == NULL) return 0; /* support free on null */
  168. ZSTD_freeDStream(zs->dstream);
  169. free(zs->seekTable.entries);
  170. free(zs);
  171. return 0;
  172. }
  173. ZSTD_seekTable* ZSTD_seekTable_create_fromSeekable(const ZSTD_seekable* zs)
  174. {
  175. ZSTD_seekTable* const st = (ZSTD_seekTable*)malloc(sizeof(ZSTD_seekTable));
  176. if (st==NULL) return NULL;
  177. st->checksumFlag = zs->seekTable.checksumFlag;
  178. st->tableLen = zs->seekTable.tableLen;
  179. /* Allocate an extra entry at the end to match logic of initial allocation */
  180. size_t const entriesSize = sizeof(seekEntry_t) * (zs->seekTable.tableLen + 1);
  181. seekEntry_t* const entries = (seekEntry_t*)malloc(entriesSize);
  182. if (entries==NULL) {
  183. free(st);
  184. return NULL;
  185. }
  186. memcpy(entries, zs->seekTable.entries, entriesSize);
  187. st->entries = entries;
  188. return st;
  189. }
  190. size_t ZSTD_seekTable_free(ZSTD_seekTable* st)
  191. {
  192. if (st == NULL) return 0; /* support free on null */
  193. free(st->entries);
  194. free(st);
  195. return 0;
  196. }
  197. /** ZSTD_seekable_offsetToFrameIndex() :
  198. * Performs a binary search to find the last frame with a decompressed offset
  199. * <= pos
  200. * @return : the frame's index */
  201. unsigned ZSTD_seekable_offsetToFrameIndex(const ZSTD_seekable* zs, unsigned long long pos)
  202. {
  203. return ZSTD_seekTable_offsetToFrameIndex(&zs->seekTable, pos);
  204. }
  205. unsigned ZSTD_seekTable_offsetToFrameIndex(const ZSTD_seekTable* st, unsigned long long pos)
  206. {
  207. U32 lo = 0;
  208. U32 hi = (U32)st->tableLen;
  209. assert(st->tableLen <= UINT_MAX);
  210. if (pos >= st->entries[st->tableLen].dOffset) {
  211. return (unsigned)st->tableLen;
  212. }
  213. while (lo + 1 < hi) {
  214. U32 const mid = lo + ((hi - lo) >> 1);
  215. if (st->entries[mid].dOffset <= pos) {
  216. lo = mid;
  217. } else {
  218. hi = mid;
  219. }
  220. }
  221. return lo;
  222. }
  223. unsigned ZSTD_seekable_getNumFrames(const ZSTD_seekable* zs)
  224. {
  225. return ZSTD_seekTable_getNumFrames(&zs->seekTable);
  226. }
  227. unsigned ZSTD_seekTable_getNumFrames(const ZSTD_seekTable* st)
  228. {
  229. assert(st->tableLen <= UINT_MAX);
  230. return (unsigned)st->tableLen;
  231. }
  232. unsigned long long ZSTD_seekable_getFrameCompressedOffset(const ZSTD_seekable* zs, unsigned frameIndex)
  233. {
  234. return ZSTD_seekTable_getFrameCompressedOffset(&zs->seekTable, frameIndex);
  235. }
  236. unsigned long long ZSTD_seekTable_getFrameCompressedOffset(const ZSTD_seekTable* st, unsigned frameIndex)
  237. {
  238. if (frameIndex >= st->tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
  239. return st->entries[frameIndex].cOffset;
  240. }
  241. unsigned long long ZSTD_seekable_getFrameDecompressedOffset(const ZSTD_seekable* zs, unsigned frameIndex)
  242. {
  243. return ZSTD_seekTable_getFrameDecompressedOffset(&zs->seekTable, frameIndex);
  244. }
  245. unsigned long long ZSTD_seekTable_getFrameDecompressedOffset(const ZSTD_seekTable* st, unsigned frameIndex)
  246. {
  247. if (frameIndex >= st->tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
  248. return st->entries[frameIndex].dOffset;
  249. }
  250. size_t ZSTD_seekable_getFrameCompressedSize(const ZSTD_seekable* zs, unsigned frameIndex)
  251. {
  252. return ZSTD_seekTable_getFrameCompressedSize(&zs->seekTable, frameIndex);
  253. }
  254. size_t ZSTD_seekTable_getFrameCompressedSize(const ZSTD_seekTable* st, unsigned frameIndex)
  255. {
  256. if (frameIndex >= st->tableLen) return ERROR(frameIndex_tooLarge);
  257. return st->entries[frameIndex + 1].cOffset -
  258. st->entries[frameIndex].cOffset;
  259. }
  260. size_t ZSTD_seekable_getFrameDecompressedSize(const ZSTD_seekable* zs, unsigned frameIndex)
  261. {
  262. return ZSTD_seekTable_getFrameDecompressedSize(&zs->seekTable, frameIndex);
  263. }
  264. size_t ZSTD_seekTable_getFrameDecompressedSize(const ZSTD_seekTable* st, unsigned frameIndex)
  265. {
  266. if (frameIndex > st->tableLen) return ERROR(frameIndex_tooLarge);
  267. return st->entries[frameIndex + 1].dOffset -
  268. st->entries[frameIndex].dOffset;
  269. }
  270. static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs)
  271. {
  272. int checksumFlag;
  273. ZSTD_seekable_customFile src = zs->src;
  274. /* read the footer, fixed size */
  275. CHECK_IO(src.seek(src.opaque, -(int)ZSTD_seekTableFooterSize, SEEK_END));
  276. CHECK_IO(src.read(src.opaque, zs->inBuff, ZSTD_seekTableFooterSize));
  277. if (MEM_readLE32(zs->inBuff + 5) != ZSTD_SEEKABLE_MAGICNUMBER) {
  278. return ERROR(prefix_unknown);
  279. }
  280. { BYTE const sfd = zs->inBuff[4];
  281. checksumFlag = sfd >> 7;
  282. /* check reserved bits */
  283. if ((sfd >> 2) & 0x1f) {
  284. return ERROR(corruption_detected);
  285. } }
  286. { U32 const numFrames = MEM_readLE32(zs->inBuff);
  287. U32 const sizePerEntry = 8 + (checksumFlag?4:0);
  288. U32 const tableSize = sizePerEntry * numFrames;
  289. U32 const frameSize = tableSize + ZSTD_seekTableFooterSize + ZSTD_SKIPPABLEHEADERSIZE;
  290. U32 remaining = frameSize - ZSTD_seekTableFooterSize; /* don't need to re-read footer */
  291. { U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
  292. CHECK_IO(src.seek(src.opaque, -(S64)frameSize, SEEK_END));
  293. CHECK_IO(src.read(src.opaque, zs->inBuff, toRead));
  294. remaining -= toRead;
  295. }
  296. if (MEM_readLE32(zs->inBuff) != (ZSTD_MAGIC_SKIPPABLE_START | 0xE)) {
  297. return ERROR(prefix_unknown);
  298. }
  299. if (MEM_readLE32(zs->inBuff+4) + ZSTD_SKIPPABLEHEADERSIZE != frameSize) {
  300. return ERROR(prefix_unknown);
  301. }
  302. { /* Allocate an extra entry at the end so that we can do size
  303. * computations on the last element without special case */
  304. seekEntry_t* const entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * (numFrames + 1));
  305. U32 idx = 0;
  306. U32 pos = 8;
  307. U64 cOffset = 0;
  308. U64 dOffset = 0;
  309. if (entries == NULL) return ERROR(memory_allocation);
  310. /* compute cumulative positions */
  311. for (; idx < numFrames; idx++) {
  312. if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
  313. U32 const offset = SEEKABLE_BUFF_SIZE - pos;
  314. U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset);
  315. memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
  316. CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead));
  317. remaining -= toRead;
  318. pos = 0;
  319. }
  320. entries[idx].cOffset = cOffset;
  321. entries[idx].dOffset = dOffset;
  322. cOffset += MEM_readLE32(zs->inBuff + pos);
  323. pos += 4;
  324. dOffset += MEM_readLE32(zs->inBuff + pos);
  325. pos += 4;
  326. if (checksumFlag) {
  327. entries[idx].checksum = MEM_readLE32(zs->inBuff + pos);
  328. pos += 4;
  329. }
  330. }
  331. entries[numFrames].cOffset = cOffset;
  332. entries[numFrames].dOffset = dOffset;
  333. zs->seekTable.entries = entries;
  334. zs->seekTable.tableLen = numFrames;
  335. zs->seekTable.checksumFlag = checksumFlag;
  336. return 0;
  337. }
  338. }
  339. }
  340. size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize)
  341. {
  342. zs->buffWrapper = (buffWrapper_t){src, srcSize, 0};
  343. { ZSTD_seekable_customFile srcFile = {&zs->buffWrapper,
  344. &ZSTD_seekable_read_buff,
  345. &ZSTD_seekable_seek_buff};
  346. return ZSTD_seekable_initAdvanced(zs, srcFile); }
  347. }
  348. size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src)
  349. {
  350. ZSTD_seekable_customFile srcFile = {src, &ZSTD_seekable_read_FILE,
  351. &ZSTD_seekable_seek_FILE};
  352. return ZSTD_seekable_initAdvanced(zs, srcFile);
  353. }
  354. size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src)
  355. {
  356. zs->src = src;
  357. { const size_t seekTableInit = ZSTD_seekable_loadSeekTable(zs);
  358. if (ZSTD_isError(seekTableInit)) return seekTableInit; }
  359. zs->decompressedOffset = (U64)-1;
  360. zs->curFrame = (U32)-1;
  361. { const size_t dstreamInit = ZSTD_initDStream(zs->dstream);
  362. if (ZSTD_isError(dstreamInit)) return dstreamInit; }
  363. return 0;
  364. }
/* Decompresses `len` bytes of the logical decompressed stream, starting at
 * decompressed position `offset`, into `dst`.
 * Requests extending past the end of the stream are clamped. Resumes from the
 * previous call's position when possible; otherwise seeks to the start of the
 * containing frame and discards output until `offset` is reached.
 * Returns the number of bytes written, or a ZSTD error code. */
size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long long offset)
{
    /* total decompressed size == the sentinel entry's dOffset */
    unsigned long long const eos = zs->seekTable.entries[zs->seekTable.tableLen].dOffset;
    /* clamp the request to the end of the stream.
     * NOTE(review): assumes offset <= eos; `eos - offset` would wrap for an
     * offset past the end — confirm callers guarantee this. */
    if (offset + len > eos) {
        len = eos - offset;
    }
    U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset);
    U32 noOutputProgressCount = 0;  /* guards against decoder livelock */
    size_t srcBytesRead = 0;        /* total compressed bytes consumed so far */
    do {
        /* check if we can continue from a previous decompress job */
        if (targetFrame != zs->curFrame || offset != zs->decompressedOffset) {
            /* cold start: rewind to the beginning of the target frame */
            zs->decompressedOffset = zs->seekTable.entries[targetFrame].dOffset;
            zs->curFrame = targetFrame;
            /* the cast below must not change the value */
            assert(zs->seekTable.entries[targetFrame].cOffset < LLONG_MAX);
            CHECK_IO(zs->src.seek(zs->src.opaque,
                                  (long long)zs->seekTable.entries[targetFrame].cOffset,
                                  SEEK_SET));
            /* drop any buffered input and restart checksum + decoder state */
            zs->in = (ZSTD_inBuffer){zs->inBuff, 0, 0};
            XXH64_reset(&zs->xxhState, 0);
            ZSTD_DCtx_reset(zs->dstream, ZSTD_reset_session_only);
            /* in-memory mode sanity check: never consume more input than the
             * wrapped buffer holds (buffWrapper.size is 0 in FILE*/
            /* mode) */
            if (zs->buffWrapper.size && srcBytesRead > zs->buffWrapper.size) {
                return ERROR(seekableIO);
            }
        }
        while (zs->decompressedOffset < offset + len) {
            size_t toRead;
            ZSTD_outBuffer outTmp;
            size_t prevOutPos;
            size_t prevInPos;
            size_t forwardProgress;
            if (zs->decompressedOffset < offset) {
                /* dummy decompressions until we get to the target offset */
                outTmp = (ZSTD_outBuffer){zs->outBuff, MIN(SEEKABLE_BUFF_SIZE, offset - zs->decompressedOffset), 0};
            } else {
                /* past the target: decode straight into the caller's buffer */
                outTmp = (ZSTD_outBuffer){dst, len, zs->decompressedOffset - offset};
            }
            prevOutPos = outTmp.pos;
            prevInPos = zs->in.pos;
            toRead = ZSTD_decompressStream(zs->dstream, &outTmp, &zs->in);
            if (ZSTD_isError(toRead)) {
                return toRead;
            }
            /* fold the newly produced bytes into the running frame checksum */
            if (zs->seekTable.checksumFlag) {
                XXH64_update(&zs->xxhState, (BYTE*)outTmp.dst + prevOutPos,
                             outTmp.pos - prevOutPos);
            }
            forwardProgress = outTmp.pos - prevOutPos;
            /* bail out if the decoder repeatedly produces no output */
            if (forwardProgress == 0) {
                if (noOutputProgressCount++ > ZSTD_SEEKABLE_NO_OUTPUT_PROGRESS_MAX) {
                    return ERROR(seekableIO);
                }
            } else {
                noOutputProgressCount = 0;
            }
            zs->decompressedOffset += forwardProgress;
            srcBytesRead += zs->in.pos - prevInPos;
            if (toRead == 0) {
                /* frame complete */
                /* verify checksum */
                if (zs->seekTable.checksumFlag &&
                    (XXH64_digest(&zs->xxhState) & 0xFFFFFFFFU) !=
                        zs->seekTable.entries[targetFrame].checksum) {
                    return ERROR(corruption_detected);
                }
                if (zs->decompressedOffset < offset + len) {
                    /* go back to the start and force a reset of the stream */
                    targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, zs->decompressedOffset);
                    /* in this case it will fail later with corruption_detected, since last block does not have checksum */
                    assert(targetFrame != zs->seekTable.tableLen);
                }
                break;
            }
            /* read in more data if we're done with this buffer */
            if (zs->in.pos == zs->in.size) {
                toRead = MIN(toRead, SEEKABLE_BUFF_SIZE);
                CHECK_IO(zs->src.read(zs->src.opaque, zs->inBuff, toRead));
                zs->in.size = toRead;
                zs->in.pos = 0;
            }
        } /* while (zs->decompressedOffset < offset + len) */
    } while (zs->decompressedOffset != offset + len);
    return len;
}
  449. size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex)
  450. {
  451. if (frameIndex >= zs->seekTable.tableLen) {
  452. return ERROR(frameIndex_tooLarge);
  453. }
  454. { size_t const decompressedSize =
  455. zs->seekTable.entries[frameIndex + 1].dOffset -
  456. zs->seekTable.entries[frameIndex].dOffset;
  457. if (dstSize < decompressedSize) {
  458. return ERROR(dstSize_tooSmall);
  459. }
  460. return ZSTD_seekable_decompress(
  461. zs, dst, decompressedSize,
  462. zs->seekTable.entries[frameIndex].dOffset);
  463. }
  464. }