gznorm.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. /* gznorm.c -- normalize a gzip stream
  2. * Copyright (C) 2018 Mark Adler
  3. * For conditions of distribution and use, see copyright notice in zlib.h
  4. * Version 1.0 7 Oct 2018 Mark Adler */
  5. // gznorm takes a gzip stream, potentially containing multiple members, and
  6. // converts it to a gzip stream with a single member. In addition the gzip
  7. // header is normalized, removing the file name and time stamp, and setting the
  8. // other header contents (XFL, OS) to fixed values. gznorm does not recompress
  9. // the data, so it is fast, but no advantage is gained from the history that
  10. // could be available across member boundaries.
  11. #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
  12. // vsnprintf, stdout, stderr, NULL, FILE
  13. #include <stdlib.h> // malloc, free
  14. #include <string.h> // strerror
  15. #include <errno.h> // errno
  16. #include <stdarg.h> // va_list, va_start, va_end
  17. #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
  18. // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
  19. // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
  20. // Z_MEM_ERROR
  21. #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
  22. # include <fcntl.h>
  23. # include <io.h>
  24. # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
  25. #else
  26. # define SET_BINARY_MODE(file)
  27. #endif
  28. #define local static
  29. // printf to an allocated string. Return the string, or NULL if the printf or
  30. // allocation fails.
  31. local char *aprintf(char *fmt, ...) {
  32. // Get the length of the result of the printf.
  33. va_list args;
  34. va_start(args, fmt);
  35. int len = vsnprintf(NULL, 0, fmt, args);
  36. va_end(args);
  37. if (len < 0)
  38. return NULL;
  39. // Allocate the required space and printf to it.
  40. char *str = malloc(len + 1);
  41. if (str == NULL)
  42. return NULL;
  43. va_start(args, fmt);
  44. vsnprintf(str, len + 1, fmt, args);
  45. va_end(args);
  46. return str;
  47. }
// Return with an error, putting an allocated error message in *err. Doing an
// inflateEnd() on an already ended state, or one with state set to Z_NULL, is
// permitted.
//
// This macro may only be used inside a function that returns int and that has
// a z_stream named strm and a char ** named err in scope. The arguments are
// passed straight to aprintf(), so they are a printf-style format followed by
// its arguments. If aprintf() fails, *err is set to NULL; 1 is returned either
// way. The do { } while (0) wrapper makes the expansion a single statement, so
// it is safe as the body of an unbraced if.
#define BYE(...) \
    do { \
        inflateEnd(&strm); \
        *err = aprintf(__VA_ARGS__); \
        return 1; \
    } while (0)
// Chunk size for buffered reads and for decompression. Twice this many bytes
// will be allocated on the stack by gzip_normalize(): one buffer of CHUNK
// bytes for the compressed input and one of CHUNK bytes for the discarded
// decompressed output. Must fit in an unsigned, since it is stored in the
// z_stream's avail_out and used in unsigned arithmetic.
#define CHUNK 16384
  60. // Read a gzip stream from in and write an equivalent normalized gzip stream to
  61. // out. If given no input, an empty gzip stream will be written. If successful,
  62. // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
  63. // details of the error are returned in *err, a pointer to an allocated string.
  64. //
  65. // The input may be a stream with multiple gzip members, which is converted to
  66. // a single gzip member on the output. Each gzip member is decompressed at the
  67. // level of deflate blocks. This enables clearing the last-block bit, shifting
  68. // the compressed data to concatenate to the previous member's compressed data,
  69. // which can end at an arbitrary bit boundary, and identifying stored blocks in
  70. // order to resynchronize those to byte boundaries. The deflate compressed data
  71. // is terminated with a 10-bit empty fixed block. If any members on the input
  72. // end with a 10-bit empty fixed block, then that block is excised from the
  73. // stream. This avoids appending empty fixed blocks for every normalization,
  74. // and assures that gzip_normalize applied a second time will not change the
  75. // input. The pad bits after stored block headers and after the final deflate
  76. // block are all forced to zeros.
  77. local int gzip_normalize(FILE *in, FILE *out, char **err) {
  78. // initialize the inflate engine to process a gzip member
  79. z_stream strm;
  80. strm.zalloc = Z_NULL;
  81. strm.zfree = Z_NULL;
  82. strm.opaque = Z_NULL;
  83. strm.avail_in = 0;
  84. strm.next_in = Z_NULL;
  85. if (inflateInit2(&strm, 15 + 16) != Z_OK)
  86. BYE("out of memory");
  87. // State while processing the input gzip stream.
  88. enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
  89. BETWEEN, // between gzip members (must end in this state)
  90. HEAD, // reading a gzip header
  91. BLOCK, // reading deflate blocks
  92. TAIL // reading a gzip trailer
  93. } state = BETWEEN; // current component being processed
  94. unsigned long crc = 0; // accumulated CRC of uncompressed data
  95. unsigned long len = 0; // accumulated length of uncompressed data
  96. unsigned long buf = 0; // deflate stream bit buffer of num bits
  97. int num = 0; // number of bits in buf (at bottom)
  98. // Write a canonical gzip header (no mod time, file name, comment, extra
  99. // block, or extra flags, and OS is marked as unknown).
  100. fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
  101. // Process the gzip stream from in until reaching the end of the input,
  102. // encountering invalid input, or experiencing an i/o error.
  103. int more; // true if not at the end of the input
  104. do {
  105. // State inside this loop.
  106. unsigned char *put; // next input buffer location to process
  107. int prev; // number of bits from previous block in
  108. // the bit buffer, or -1 if not at the
  109. // start of a block
  110. unsigned long long memb; // uncompressed length of member
  111. size_t tail; // number of trailer bytes read (0..8)
  112. unsigned long part; // accumulated trailer component
  113. // Get the next chunk of input from in.
  114. unsigned char dat[CHUNK];
  115. strm.avail_in = fread(dat, 1, CHUNK, in);
  116. if (strm.avail_in == 0)
  117. break;
  118. more = strm.avail_in == CHUNK;
  119. strm.next_in = put = dat;
  120. // Run that chunk of input through the inflate engine to exhaustion.
  121. do {
  122. // At this point it is assured that strm.avail_in > 0.
  123. // Inflate until the end of a gzip component (header, deflate
  124. // block, trailer) is reached, or until all of the chunk is
  125. // consumed. The resulting decompressed data is discarded, though
  126. // the total size of the decompressed data in each member is
  127. // tracked, for the calculation of the total CRC.
  128. do {
  129. // inflate and handle any errors
  130. unsigned char scrap[CHUNK];
  131. strm.avail_out = CHUNK;
  132. strm.next_out = scrap;
  133. int ret = inflate(&strm, Z_BLOCK);
  134. if (ret == Z_MEM_ERROR)
  135. BYE("out of memory");
  136. if (ret == Z_DATA_ERROR)
  137. BYE("input invalid: %s", strm.msg);
  138. if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
  139. BYE("internal error");
  140. // Update the number of uncompressed bytes generated in this
  141. // member. The actual count (not modulo 2^32) is required to
  142. // correctly compute the total CRC.
  143. unsigned got = CHUNK - strm.avail_out;
  144. memb += got;
  145. if (memb < got)
  146. BYE("overflow error");
  147. // Continue to process this chunk until it is consumed, or
  148. // until the end of a component (header, deflate block, or
  149. // trailer) is reached.
  150. } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
  151. // Since strm.avail_in was > 0 for the inflate call, some input was
  152. // just consumed. It is therefore assured that put < strm.next_in.
  153. // Disposition the consumed component or part of a component.
  154. switch (state) {
  155. case BETWEEN:
  156. state = HEAD;
  157. // Fall through to HEAD when some or all of the header is
  158. // processed.
  159. case HEAD:
  160. // Discard the header.
  161. if (strm.data_type & 0x80) {
  162. // End of header reached -- deflate blocks follow.
  163. put = strm.next_in;
  164. prev = num;
  165. memb = 0;
  166. state = BLOCK;
  167. }
  168. break;
  169. case BLOCK:
  170. // Copy the deflate stream to the output, but with the
  171. // last-block-bit cleared. Re-synchronize stored block
  172. // headers to the output byte boundaries. The bytes at
  173. // put..strm.next_in-1 is the compressed data that has been
  174. // processed and is ready to be copied to the output.
  175. // At this point, it is assured that new compressed data is
  176. // available, i.e., put < strm.next_in. If prev is -1, then
  177. // that compressed data starts in the middle of a deflate
  178. // block. If prev is not -1, then the bits in the bit
  179. // buffer, possibly combined with the bits in *put, contain
  180. // the three-bit header of the new deflate block. In that
  181. // case, prev is the number of bits from the previous block
  182. // that remain in the bit buffer. Since num is the number
  183. // of bits in the bit buffer, we have that num - prev is
  184. // the number of bits from the new block currently in the
  185. // bit buffer.
  186. // If strm.data_type & 0xc0 is 0x80, then the last byte of
  187. // the available compressed data includes the last bits of
  188. // the end of a deflate block. In that case, that last byte
  189. // also has strm.data_type & 0x1f bits of the next deflate
  190. // block, in the range 0..7. If strm.data_type & 0xc0 is
  191. // 0xc0, then the last byte of the compressed data is the
  192. // end of the deflate stream, followed by strm.data_type &
  193. // 0x1f pad bits, also in the range 0..7.
  194. // Set bits to the number of bits not yet consumed from the
  195. // last byte. If we are at the end of the block, bits is
  196. // either the number of bits in the last byte belonging to
  197. // the next block, or the number of pad bits after the
  198. // final block. In either of those cases, bits is in the
  199. // range 0..7.
  200. ; // (required due to C syntax oddity)
  201. int bits = strm.data_type & 0x1f;
  202. if (prev != -1) {
  203. // We are at the start of a new block. Clear the last
  204. // block bit, and check for special cases. If it is a
  205. // stored block, then emit the header and pad to the
  206. // next byte boundary. If it is a final, empty fixed
  207. // block, then excise it.
  208. // Some or all of the three header bits for this block
  209. // may already be in the bit buffer. Load any remaining
  210. // header bits into the bit buffer.
  211. if (num - prev < 3) {
  212. buf += (unsigned long)*put++ << num;
  213. num += 8;
  214. }
  215. // Set last to have a 1 in the position of the last
  216. // block bit in the bit buffer.
  217. unsigned long last = (unsigned long)1 << prev;
  218. if (((buf >> prev) & 7) == 3) {
  219. // This is a final fixed block. Load at least ten
  220. // bits from this block, including the header, into
  221. // the bit buffer. We already have at least three,
  222. // so at most one more byte needs to be loaded.
  223. if (num - prev < 10) {
  224. if (put == strm.next_in)
  225. // Need to go get and process more input.
  226. // We'll end up back here to finish this.
  227. break;
  228. buf += (unsigned long)*put++ << num;
  229. num += 8;
  230. }
  231. if (((buf >> prev) & 0x3ff) == 3) {
  232. // That final fixed block is empty. Delete it
  233. // to avoid adding an empty block every time a
  234. // gzip stream is normalized.
  235. num = prev;
  236. buf &= last - 1; // zero the pad bits
  237. }
  238. }
  239. else if (((buf >> prev) & 6) == 0) {
  240. // This is a stored block. Flush to the next
  241. // byte boundary after the three-bit header.
  242. num = (prev + 10) & ~7;
  243. buf &= last - 1; // zero the pad bits
  244. }
  245. // Clear the last block bit.
  246. buf &= ~last;
  247. // Write out complete bytes in the bit buffer.
  248. while (num >= 8) {
  249. putc(buf, out);
  250. buf >>= 8;
  251. num -= 8;
  252. }
  253. // If no more bytes left to process, then we have
  254. // consumed the byte that had bits from the next block.
  255. if (put == strm.next_in)
  256. bits = 0;
  257. }
  258. // We are done handling the deflate block header. Now copy
  259. // all or almost all of the remaining compressed data that
  260. // has been processed so far. Don't copy one byte at the
  261. // end if it contains bits from the next deflate block or
  262. // pad bits at the end of a deflate block.
  263. // mix is 1 if we are at the end of a deflate block, and if
  264. // some of the bits in the last byte follow this block. mix
  265. // is 0 if we are in the middle of a deflate block, if the
  266. // deflate block ended on a byte boundary, or if all of the
  267. // compressed data processed so far has been consumed.
  268. int mix = (strm.data_type & 0x80) && bits;
  269. // Copy all of the processed compressed data to the output,
  270. // except for the last byte if it contains bits from the
  271. // next deflate block or pad bits at the end of the deflate
  272. // stream. Copy the data after shifting in num bits from
  273. // buf in front of it, leaving num bits from the end of the
  274. // compressed data in buf when done.
  275. unsigned char *end = strm.next_in - mix;
  276. if (put < end) {
  277. if (num)
  278. // Insert num bits from buf before the data being
  279. // copied.
  280. do {
  281. buf += (unsigned)(*put++) << num;
  282. putc(buf, out);
  283. buf >>= 8;
  284. } while (put < end);
  285. else {
  286. // No shifting needed -- write directly.
  287. fwrite(put, 1, end - put, out);
  288. put = end;
  289. }
  290. }
  291. // Process the last processed byte if it wasn't written.
  292. if (mix) {
  293. // Load the last byte into the bit buffer.
  294. buf += (unsigned)(*put++) << num;
  295. num += 8;
  296. if (strm.data_type & 0x40) {
  297. // We are at the end of the deflate stream and
  298. // there are bits pad bits. Discard the pad bits
  299. // and write a byte to the output, if available.
  300. // Leave the num bits left over in buf to prepend
  301. // to the next deflate stream.
  302. num -= bits;
  303. if (num >= 8) {
  304. putc(buf, out);
  305. num -= 8;
  306. buf >>= 8;
  307. }
  308. // Force the pad bits in the bit buffer to zeros.
  309. buf &= ((unsigned long)1 << num) - 1;
  310. // Don't need to set prev here since going to TAIL.
  311. }
  312. else
  313. // At the end of an internal deflate block. Leave
  314. // the last byte in the bit buffer to examine on
  315. // the next entry to BLOCK, when more bits from the
  316. // next block will be available.
  317. prev = num - bits; // number of bits in buffer
  318. // from current block
  319. }
  320. // Don't have a byte left over, so we are in the middle of
  321. // a deflate block, or the deflate block ended on a byte
  322. // boundary. Set prev appropriately for the next entry into
  323. // BLOCK.
  324. else if (strm.data_type & 0x80)
  325. // The block ended on a byte boundary, so no header
  326. // bits are in the bit buffer.
  327. prev = num;
  328. else
  329. // In the middle of a deflate block, so no header here.
  330. prev = -1;
  331. // Check for the end of the deflate stream.
  332. if ((strm.data_type & 0xc0) == 0xc0) {
  333. // That ends the deflate stream on the input side, the
  334. // pad bits were discarded, and any remaining bits from
  335. // the last block in the stream are saved in the bit
  336. // buffer to prepend to the next stream. Process the
  337. // gzip trailer next.
  338. tail = 0;
  339. part = 0;
  340. state = TAIL;
  341. }
  342. break;
  343. case TAIL:
  344. // Accumulate available trailer bytes to update the total
  345. // CRC and the total uncompressed length.
  346. do {
  347. part = (part >> 8) + ((unsigned long)(*put++) << 24);
  348. tail++;
  349. if (tail == 4) {
  350. // Update the total CRC.
  351. z_off_t len2 = memb;
  352. if (len2 < 0 || (unsigned long long)len2 != memb)
  353. BYE("overflow error");
  354. crc = crc ? crc32_combine(crc, part, len2) : part;
  355. part = 0;
  356. }
  357. else if (tail == 8) {
  358. // Update the total uncompressed length. (It's ok
  359. // if this sum is done modulo 2^32.)
  360. len += part;
  361. // At the end of a member. Set up to inflate an
  362. // immediately following gzip member. (If we made
  363. // it this far, then the trailer was valid.)
  364. if (inflateReset(&strm) != Z_OK)
  365. BYE("internal error");
  366. state = BETWEEN;
  367. break;
  368. }
  369. } while (put < strm.next_in);
  370. break;
  371. }
  372. // Process the input buffer until completely consumed.
  373. } while (strm.avail_in > 0);
  374. // Process input until end of file, invalid input, or i/o error.
  375. } while (more);
  376. // Done with the inflate engine.
  377. inflateEnd(&strm);
  378. // Verify the validity of the input.
  379. if (state != BETWEEN)
  380. BYE("input invalid: incomplete gzip stream");
  381. // Write the remaining deflate stream bits, followed by a terminating
  382. // deflate fixed block.
  383. buf += (unsigned long)3 << num;
  384. putc(buf, out);
  385. putc(buf >> 8, out);
  386. if (num > 6)
  387. putc(0, out);
  388. // Write the gzip trailer, which is the CRC and the uncompressed length
  389. // modulo 2^32, both in little-endian order.
  390. putc(crc, out);
  391. putc(crc >> 8, out);
  392. putc(crc >> 16, out);
  393. putc(crc >> 24, out);
  394. putc(len, out);
  395. putc(len >> 8, out);
  396. putc(len >> 16, out);
  397. putc(len >> 24, out);
  398. fflush(out);
  399. // Check for any i/o errors.
  400. if (ferror(in) || ferror(out))
  401. BYE("i/o error: %s", strerror(errno));
  402. // All good!
  403. *err = NULL;
  404. return 0;
  405. }
  406. // Normalize the gzip stream on stdin, writing the result to stdout.
  407. int main(void) {
  408. // Avoid end-of-line conversions on evil operating systems.
  409. SET_BINARY_MODE(stdin);
  410. SET_BINARY_MODE(stdout);
  411. // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
  412. char *err;
  413. int ret = gzip_normalize(stdin, stdout, &err);
  414. if (ret)
  415. fprintf(stderr, "gznorm error: %s\n", err);
  416. free(err);
  417. return ret;
  418. }