decodecorpus.c 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929
  1. /*
  2. * Copyright (c) Yann Collet, Facebook, Inc.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include <limits.h>
  11. #include <math.h>
  12. #include <stddef.h>
  13. #include <stdio.h>
  14. #include <stdlib.h>
  15. #include <string.h>
  16. #include "util.h"
  17. #include "timefn.h" /* UTIL_clockSpanMicro, SEC_TO_MICRO, UTIL_TIME_INITIALIZER */
  18. #include "zstd.h"
  19. #include "zstd_internal.h"
  20. #include "mem.h"
  21. #define ZDICT_STATIC_LINKING_ONLY
  22. #include "zdict.h"
  23. /* Direct access to internal compression functions is required */
  24. #include "zstd_compress.c"
  25. #define XXH_STATIC_LINKING_ONLY
  26. #include "xxhash.h" /* XXH64 */
  27. #ifndef MIN
  28. #define MIN(a, b) ((a) < (b) ? (a) : (b))
  29. #endif
  30. #ifndef MAX_PATH
  31. #ifdef PATH_MAX
  32. #define MAX_PATH PATH_MAX
  33. #else
  34. #define MAX_PATH 256
  35. #endif
  36. #endif
  37. /*-************************************
  38. * DISPLAY Macros
  39. **************************************/
  40. #define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
  41. #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
  42. static U32 g_displayLevel = 2;
  43. #define DISPLAYUPDATE(...) \
  44. do { \
  45. if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || \
  46. (g_displayLevel >= 4)) { \
  47. g_displayClock = UTIL_getTime(); \
  48. DISPLAY(__VA_ARGS__); \
  49. if (g_displayLevel >= 4) fflush(stderr); \
  50. } \
  51. } while (0)
  52. static const U64 g_refreshRate = SEC_TO_MICRO / 6;
  53. static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
  54. #define CHECKERR(code) \
  55. do { \
  56. if (ZSTD_isError(code)) { \
  57. DISPLAY("Error occurred while generating data: %s\n", \
  58. ZSTD_getErrorName(code)); \
  59. exit(1); \
  60. } \
  61. } while (0)
  62. /*-*******************************************************
  63. * Random function
  64. *********************************************************/
  65. static U32 RAND(U32* src)
  66. {
  67. #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r)))
  68. static const U32 prime1 = 2654435761U;
  69. static const U32 prime2 = 2246822519U;
  70. U32 rand32 = *src;
  71. rand32 *= prime1;
  72. rand32 += prime2;
  73. rand32 = RAND_rotl32(rand32, 13);
  74. *src = rand32;
  75. return RAND_rotl32(rand32, 27);
  76. #undef RAND_rotl32
  77. }
  78. #define DISTSIZE (8192)
  79. /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */
  80. static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb)
  81. {
  82. size_t i;
  83. BYTE* op = ptr;
  84. for (i = 0; i < size; i++) {
  85. op[i] = (BYTE) (RAND(seed) % (maxSymb + 1));
  86. }
  87. }
  88. /* Write `size` random bytes into `ptr` */
  89. static void RAND_buffer(U32* seed, void* ptr, size_t size)
  90. {
  91. size_t i;
  92. BYTE* op = ptr;
  93. for (i = 0; i + 4 <= size; i += 4) {
  94. MEM_writeLE32(op + i, RAND(seed));
  95. }
  96. for (; i < size; i++) {
  97. op[i] = RAND(seed) & 0xff;
  98. }
  99. }
  100. /* Write `size` bytes into `ptr` following the distribution `dist` */
  101. static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size)
  102. {
  103. size_t i;
  104. BYTE* op = ptr;
  105. for (i = 0; i < size; i++) {
  106. op[i] = dist[RAND(seed) % DISTSIZE];
  107. }
  108. }
  109. /* Generate a random distribution where the frequency of each symbol follows a
  110. * geometric distribution defined by `weight`
  111. * `dist` should have size at least `DISTSIZE` */
  112. static void RAND_genDist(U32* seed, BYTE* dist, double weight)
  113. {
  114. size_t i = 0;
  115. size_t statesLeft = DISTSIZE;
  116. BYTE symb = (BYTE) (RAND(seed) % 256);
  117. BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */
  118. while (i < DISTSIZE) {
  119. size_t states = ((size_t)(weight * statesLeft)) + 1;
  120. size_t j;
  121. for (j = 0; j < states && i < DISTSIZE; j++, i++) {
  122. dist[i] = symb;
  123. }
  124. symb += step;
  125. statesLeft -= states;
  126. }
  127. }
  128. /* Generates a random number in the range [min, max) */
  129. static inline U32 RAND_range(U32* seed, U32 min, U32 max)
  130. {
  131. return (RAND(seed) % (max-min)) + min;
  132. }
  133. #define ROUND(x) ((U32)(x + 0.5))
  134. /* Generates a random number in an exponential distribution with mean `mean` */
  135. static double RAND_exp(U32* seed, double mean)
  136. {
  137. double const u = RAND(seed) / (double) UINT_MAX;
  138. return log(1-u) * (-mean);
  139. }
  140. /*-*******************************************************
  141. * Constants and Structs
  142. *********************************************************/
  143. const char *BLOCK_TYPES[] = {"raw", "rle", "compressed"};
  144. #define MAX_DECOMPRESSED_SIZE_LOG 20
  145. #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG)
  146. #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */
  147. #define MIN_SEQ_LEN (3)
  148. #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN)
  149. BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE];
  150. BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2];
  151. BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX];
  152. seqDef SEQUENCE_BUFFER[MAX_NB_SEQ];
  153. BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */
  154. BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX];
  155. BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX];
  156. BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX];
  157. U64 WKSP[HUF_WORKSPACE_SIZE_U64];
  /* Frame header parameters chosen by writeFrameHeader() */
  typedef struct {
      /* 0 means unknown (unless contentSize == windowSize == 0) */
      size_t contentSize;
      /* contentSize >= windowSize means single segment */
      unsigned windowSize;
  } frameHeader_t;
  162. /* For repeat modes */
  163. typedef struct {
  164. U32 rep[ZSTD_REP_NUM];
  165. int hufInit;
  166. /* the distribution used in the previous block for repeat mode */
  167. BYTE hufDist[DISTSIZE];
  168. HUF_CElt hufTable [HUF_CTABLE_SIZE_ST(255)];
  169. int fseInit;
  170. FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
  171. FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
  172. FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
  173. /* Symbols that were present in the previous distribution, for use with
  174. * set_repeat */
  175. BYTE litlengthSymbolSet[36];
  176. BYTE offsetSymbolSet[29];
  177. BYTE matchlengthSymbolSet[53];
  178. } cblockStats_t;
  179. typedef struct {
  180. void* data;
  181. void* dataStart;
  182. void* dataEnd;
  183. void* src;
  184. void* srcStart;
  185. void* srcEnd;
  186. frameHeader_t header;
  187. cblockStats_t stats;
  188. cblockStats_t oldStats; /* so they can be rolled back if uncompressible */
  189. } frame_t;
  190. typedef struct {
  191. int useDict;
  192. U32 dictID;
  193. size_t dictContentSize;
  194. BYTE* dictContent;
  195. } dictInfo;
  /* Kind of output to generate */
  typedef enum {
      /* generate frames */
      gt_frame = 0,
      /* generate compressed blocks without block/frame headers */
      gt_block,
  } genType_e;
  200. /*-*******************************************************
  201. * Global variables (set from command line)
  202. *********************************************************/
  203. U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG; /* <= 20 */
  204. U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */
  205. /*-*******************************************************
  206. * Generator Functions
  207. *********************************************************/
  /* advanced options on generation */
  struct {
      /* force the content size to be present */
      int contentSize;
  } opts;
  211. /* Generate and write a random frame header */
  212. static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
  213. {
  214. BYTE* const op = frame->data;
  215. size_t pos = 0;
  216. frameHeader_t fh;
  217. BYTE windowByte = 0;
  218. int singleSegment = 0;
  219. int contentSizeFlag = 0;
  220. int fcsCode = 0;
  221. memset(&fh, 0, sizeof(fh));
  222. /* generate window size */
  223. {
  224. /* Follow window algorithm from specification */
  225. int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10);
  226. int const mantissa = RAND(seed) % 8;
  227. windowByte = (BYTE) ((exponent << 3) | mantissa);
  228. fh.windowSize = (1U << (exponent + 10));
  229. fh.windowSize += fh.windowSize / 8 * mantissa;
  230. }
  231. {
  232. /* Generate random content size */
  233. size_t highBit;
  234. if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) {
  235. /* do content of at least 128 bytes */
  236. highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog);
  237. } else if (RAND(seed) & 3) {
  238. /* do small content */
  239. highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog));
  240. } else {
  241. /* 0 size frame */
  242. highBit = 0;
  243. }
  244. fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0;
  245. /* provide size sometimes */
  246. contentSizeFlag = opts.contentSize | (RAND(seed) & 1);
  247. if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) {
  248. /* do single segment sometimes */
  249. fh.windowSize = (U32) fh.contentSize;
  250. singleSegment = 1;
  251. }
  252. }
  253. if (contentSizeFlag) {
  254. /* Determine how large fcs field has to be */
  255. int minFcsCode = (fh.contentSize >= 256) +
  256. (fh.contentSize >= 65536 + 256) +
  257. (fh.contentSize > 0xFFFFFFFFU);
  258. if (!singleSegment && !minFcsCode) {
  259. minFcsCode = 1;
  260. }
  261. fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode));
  262. if (fcsCode == 1 && fh.contentSize < 256) fcsCode++;
  263. }
  264. /* write out the header */
  265. MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
  266. pos += 4;
  267. {
  268. /*
  269. * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6)
  270. * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5)
  271. * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2)
  272. * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0)
  273. * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
  274. */
  275. int const dictBits = info.useDict ? 3 : 0;
  276. BYTE const frameHeaderDescriptor =
  277. (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits);
  278. op[pos++] = frameHeaderDescriptor;
  279. }
  280. if (!singleSegment) {
  281. op[pos++] = windowByte;
  282. }
  283. if (info.useDict) {
  284. MEM_writeLE32(op + pos, (U32) info.dictID);
  285. pos += 4;
  286. }
  287. if (contentSizeFlag) {
  288. switch (fcsCode) {
  289. default: /* Impossible */
  290. case 0: op[pos++] = (BYTE) fh.contentSize; break;
  291. case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break;
  292. case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break;
  293. case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break;
  294. }
  295. }
  296. DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize);
  297. DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize);
  298. DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag);
  299. DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment);
  300. frame->data = op + pos;
  301. frame->header = fh;
  302. }
  303. /* Write a literal block in either raw or RLE form, return the literals size */
  304. static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize)
  305. {
  306. BYTE* op = (BYTE*)frame->data;
  307. int const type = RAND(seed) % 2;
  308. int const sizeFormatDesc = RAND(seed) % 8;
  309. size_t litSize;
  310. size_t maxLitSize = MIN(contentSize, g_maxBlockSize);
  311. if (sizeFormatDesc == 0) {
  312. /* Size_FormatDesc = ?0 */
  313. maxLitSize = MIN(maxLitSize, 31);
  314. } else if (sizeFormatDesc <= 4) {
  315. /* Size_FormatDesc = 01 */
  316. maxLitSize = MIN(maxLitSize, 4095);
  317. } else {
  318. /* Size_Format = 11 */
  319. maxLitSize = MIN(maxLitSize, 1048575);
  320. }
  321. litSize = RAND(seed) % (maxLitSize + 1);
  322. if (frame->src == frame->srcStart && litSize == 0) {
  323. litSize = 1; /* no empty literals if there's nothing preceding this block */
  324. }
  325. if (litSize + 3 > contentSize) {
  326. litSize = contentSize; /* no matches shorter than 3 are allowed */
  327. }
  328. /* use smallest size format that fits */
  329. if (litSize < 32) {
  330. op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff;
  331. op += 1;
  332. } else if (litSize < 4096) {
  333. op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff;
  334. op[1] = (litSize >> 4) & 0xff;
  335. op += 2;
  336. } else {
  337. op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff;
  338. op[1] = (litSize >> 4) & 0xff;
  339. op[2] = (litSize >> 12) & 0xff;
  340. op += 3;
  341. }
  342. if (type == 0) {
  343. /* Raw literals */
  344. DISPLAYLEVEL(4, " raw literals\n");
  345. RAND_buffer(seed, LITERAL_BUFFER, litSize);
  346. memcpy(op, LITERAL_BUFFER, litSize);
  347. op += litSize;
  348. } else {
  349. /* RLE literals */
  350. BYTE const symb = (BYTE) (RAND(seed) % 256);
  351. DISPLAYLEVEL(4, " rle literals: 0x%02x\n", (unsigned)symb);
  352. memset(LITERAL_BUFFER, symb, litSize);
  353. op[0] = symb;
  354. op++;
  355. }
  356. frame->data = op;
  357. return litSize;
  358. }
  359. /* Generate a Huffman header for the given source */
  360. static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize,
  361. const void* src, size_t srcSize)
  362. {
  363. BYTE* const ostart = (BYTE*)dst;
  364. BYTE* op = ostart;
  365. unsigned huffLog = 11;
  366. unsigned maxSymbolValue = 255;
  367. unsigned count[HUF_SYMBOLVALUE_MAX+1];
  368. /* Scan input and build symbol stats */
  369. { size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP));
  370. assert(!HIST_isError(largest));
  371. if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; } /* single symbol, rle */
  372. if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */
  373. }
  374. /* Build Huffman Tree */
  375. /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */
  376. huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1);
  377. DISPLAYLEVEL(6, " huffman log: %u\n", huffLog);
  378. { size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP));
  379. CHECKERR(maxBits);
  380. huffLog = (U32)maxBits;
  381. }
  382. /* Write table description header */
  383. { size_t const hSize = HUF_writeCTable (op, dstSize, hufTable, maxSymbolValue, huffLog);
  384. if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */
  385. op += hSize;
  386. }
  387. return op - ostart;
  388. }
  389. /* Write a Huffman coded literals block and return the literals size */
  390. static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize)
  391. {
  392. BYTE* origop = (BYTE*)frame->data;
  393. BYTE* opend = (BYTE*)frame->dataEnd;
  394. BYTE* op;
  395. BYTE* const ostart = origop;
  396. int const sizeFormat = RAND(seed) % 4;
  397. size_t litSize;
  398. size_t hufHeaderSize = 0;
  399. size_t compressedSize = 0;
  400. size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize);
  401. symbolEncodingType_e hType;
  402. if (contentSize < 64) {
  403. /* make sure we get reasonably-sized literals for compression */
  404. return ERROR(GENERIC);
  405. }
  406. DISPLAYLEVEL(4, " compressed literals\n");
  407. switch (sizeFormat) {
  408. case 0: /* fall through, size is the same as case 1 */
  409. case 1:
  410. maxLitSize = MIN(maxLitSize, 1023);
  411. origop += 3;
  412. break;
  413. case 2:
  414. maxLitSize = MIN(maxLitSize, 16383);
  415. origop += 4;
  416. break;
  417. case 3:
  418. maxLitSize = MIN(maxLitSize, 262143);
  419. origop += 5;
  420. break;
  421. default:; /* impossible */
  422. }
  423. do {
  424. op = origop;
  425. do {
  426. litSize = RAND(seed) % (maxLitSize + 1);
  427. } while (litSize < 32); /* avoid small literal sizes */
  428. if (litSize + 3 > contentSize) {
  429. litSize = contentSize; /* no matches shorter than 3 are allowed */
  430. }
  431. /* most of the time generate a new distribution */
  432. if ((RAND(seed) & 3) || !frame->stats.hufInit) {
  433. do {
  434. if (RAND(seed) & 3) {
  435. /* add 10 to ensure some compressibility */
  436. double const weight = ((RAND(seed) % 90) + 10) / 100.0;
  437. DISPLAYLEVEL(5, " distribution weight: %d%%\n",
  438. (int)(weight * 100));
  439. RAND_genDist(seed, frame->stats.hufDist, weight);
  440. } else {
  441. /* sometimes do restricted range literals to force
  442. * non-huffman headers */
  443. DISPLAYLEVEL(5, " small range literals\n");
  444. RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE,
  445. 15);
  446. }
  447. RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
  448. litSize);
  449. /* generate the header from the distribution instead of the
  450. * actual data to avoid bugs with symbols that were in the
  451. * distribution but never showed up in the output */
  452. hufHeaderSize = writeHufHeader(
  453. seed, frame->stats.hufTable, op, opend - op,
  454. frame->stats.hufDist, DISTSIZE);
  455. CHECKERR(hufHeaderSize);
  456. /* repeat until a valid header is written */
  457. } while (hufHeaderSize == 0);
  458. op += hufHeaderSize;
  459. hType = set_compressed;
  460. frame->stats.hufInit = 1;
  461. } else {
  462. /* repeat the distribution/table from last time */
  463. DISPLAYLEVEL(5, " huffman repeat stats\n");
  464. RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
  465. litSize);
  466. hufHeaderSize = 0;
  467. hType = set_repeat;
  468. }
  469. do {
  470. compressedSize =
  471. sizeFormat == 0
  472. ? HUF_compress1X_usingCTable(
  473. op, opend - op, LITERAL_BUFFER, litSize,
  474. frame->stats.hufTable)
  475. : HUF_compress4X_usingCTable(
  476. op, opend - op, LITERAL_BUFFER, litSize,
  477. frame->stats.hufTable);
  478. CHECKERR(compressedSize);
  479. /* this only occurs when it could not compress or similar */
  480. } while (compressedSize <= 0);
  481. op += compressedSize;
  482. compressedSize += hufHeaderSize;
  483. DISPLAYLEVEL(5, " regenerated size: %u\n", (unsigned)litSize);
  484. DISPLAYLEVEL(5, " compressed size: %u\n", (unsigned)compressedSize);
  485. if (compressedSize >= litSize) {
  486. DISPLAYLEVEL(5, " trying again\n");
  487. /* if we have to try again, reset the stats so we don't accidentally
  488. * try to repeat a distribution we just made */
  489. frame->stats = frame->oldStats;
  490. } else {
  491. break;
  492. }
  493. } while (1);
  494. /* write header */
  495. switch (sizeFormat) {
  496. case 0: /* fall through, size is the same as case 1 */
  497. case 1: {
  498. U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
  499. ((U32)compressedSize << 14);
  500. MEM_writeLE24(ostart, header);
  501. break;
  502. }
  503. case 2: {
  504. U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
  505. ((U32)compressedSize << 18);
  506. MEM_writeLE32(ostart, header);
  507. break;
  508. }
  509. case 3: {
  510. U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
  511. ((U32)compressedSize << 22);
  512. MEM_writeLE32(ostart, header);
  513. ostart[4] = (BYTE)(compressedSize >> 10);
  514. break;
  515. }
  516. default:; /* impossible */
  517. }
  518. frame->data = op;
  519. return litSize;
  520. }
  521. static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
  522. {
  523. /* only do compressed for larger segments to avoid compressibility issues */
  524. if (RAND(seed) & 7 && contentSize >= 64) {
  525. return writeLiteralsBlockCompressed(seed, frame, contentSize);
  526. } else {
  527. return writeLiteralsBlockSimple(seed, frame, contentSize);
  528. }
  529. }
  530. static inline void initSeqStore(seqStore_t *seqStore) {
  531. seqStore->maxNbSeq = MAX_NB_SEQ;
  532. seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;
  533. seqStore->sequencesStart = SEQUENCE_BUFFER;
  534. seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
  535. seqStore->llCode = SEQUENCE_LLCODE;
  536. seqStore->mlCode = SEQUENCE_MLCODE;
  537. seqStore->ofCode = SEQUENCE_OFCODE;
  538. ZSTD_resetSeqStore(seqStore);
  539. }
  540. /* Randomly generate sequence commands */
  541. static U32
  542. generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
  543. size_t contentSize, size_t literalsSize, dictInfo info)
  544. {
  545. /* The total length of all the matches */
  546. size_t const remainingMatch = contentSize - literalsSize;
  547. size_t excessMatch = 0;
  548. U32 numSequences = 0;
  549. U32 i;
  550. const BYTE* literals = LITERAL_BUFFER;
  551. BYTE* srcPtr = frame->src;
  552. if (literalsSize != contentSize) {
  553. /* each match must be at least MIN_SEQ_LEN, so this is the maximum
  554. * number of sequences we can have */
  555. U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN;
  556. numSequences = (RAND(seed) % maxSequences) + 1;
  557. /* the extra match lengths we have to allocate to each sequence */
  558. excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN;
  559. }
  560. DISPLAYLEVEL(5, " total match lengths: %u\n", (unsigned)remainingMatch);
  561. for (i = 0; i < numSequences; i++) {
  562. /* Generate match and literal lengths by exponential distribution to
  563. * ensure nice numbers */
  564. U32 matchLen =
  565. MIN_SEQ_LEN +
  566. ROUND(RAND_exp(seed, excessMatch / (double)(numSequences - i)));
  567. U32 literalLen =
  568. (RAND(seed) & 7)
  569. ? ROUND(RAND_exp(seed,
  570. literalsSize /
  571. (double)(numSequences - i)))
  572. : 0;
  573. /* actual offset, code to send, and point to copy up to when shifting
  574. * codes in the repeat offsets history */
  575. U32 offset, offsetCode, repIndex;
  576. /* bounds checks */
  577. matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN);
  578. literalLen = MIN(literalLen, (U32) literalsSize);
  579. if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1;
  580. if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch;
  581. memcpy(srcPtr, literals, literalLen);
  582. srcPtr += literalLen;
  583. do {
  584. if (RAND(seed) & 7) {
  585. /* do a normal offset */
  586. U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart);
  587. offset = (RAND(seed) %
  588. MIN(frame->header.windowSize,
  589. (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) +
  590. 1;
  591. if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) {
  592. /* need to occasionally generate offsets that go past the start */
  593. /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */
  594. U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1;
  595. offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart;
  596. if (offset > frame->header.windowSize) {
  597. if (lenPastStart < MIN_SEQ_LEN) {
  598. /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */
  599. /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */
  600. /* make sure lenPastStart does not go past dictionary start though */
  601. lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize);
  602. offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart;
  603. }
  604. { U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart);
  605. matchLen = MIN(matchLen, matchLenBound);
  606. }
  607. }
  608. }
  609. offsetCode = STORE_OFFSET(offset);
  610. repIndex = 2;
  611. } else {
  612. /* do a repeat offset */
  613. U32 const randomRepIndex = RAND(seed) % 3;
  614. offsetCode = STORE_REPCODE(randomRepIndex + 1); /* expects values between 1 & 3 */
  615. if (literalLen > 0) {
  616. offset = frame->stats.rep[randomRepIndex];
  617. repIndex = randomRepIndex;
  618. } else {
  619. /* special case : literalLen == 0 */
  620. offset = randomRepIndex == 2 ? frame->stats.rep[0] - 1
  621. : frame->stats.rep[randomRepIndex + 1];
  622. repIndex = MIN(2, randomRepIndex + 1);
  623. }
  624. }
  625. } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
  626. { BYTE* const dictEnd = info.dictContent + info.dictContentSize;
  627. size_t j;
  628. for (j = 0; j < matchLen; j++) {
  629. if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
  630. /* copy from dictionary instead of literals */
  631. size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart);
  632. *srcPtr = *(dictEnd - dictOffset);
  633. }
  634. else {
  635. *srcPtr = *(srcPtr-offset);
  636. }
  637. srcPtr++;
  638. } }
  639. { int r;
  640. for (r = repIndex; r > 0; r--) {
  641. frame->stats.rep[r] = frame->stats.rep[r - 1];
  642. }
  643. frame->stats.rep[0] = offset;
  644. }
  645. DISPLAYLEVEL(6, " LL: %5u OF: %5u ML: %5u",
  646. (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen);
  647. DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u",
  648. (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i);
  649. DISPLAYLEVEL(6, "\n");
  650. if (STORED_IS_REPCODE(offsetCode)) { /* expects sumtype numeric representation of ZSTD_storeSeq() */
  651. DISPLAYLEVEL(7, " repeat offset: %d\n", (int)repIndex);
  652. }
  653. /* use libzstd sequence handling */
  654. ZSTD_storeSeq(seqStore, literalLen, literals, literals + literalLen,
  655. offsetCode, matchLen);
  656. literalsSize -= literalLen;
  657. excessMatch -= (matchLen - MIN_SEQ_LEN);
  658. literals += literalLen;
  659. }
  660. memcpy(srcPtr, literals, literalsSize);
  661. srcPtr += literalsSize;
  662. DISPLAYLEVEL(6, " excess literals: %5u", (unsigned)literalsSize);
  663. DISPLAYLEVEL(7, " srcPos: %8u", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart));
  664. DISPLAYLEVEL(6, "\n");
  665. return numSequences;
  666. }
  667. static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue)
  668. {
  669. size_t i;
  670. memset(set, 0, (size_t)maxSymbolValue+1);
  671. for (i = 0; i < len; i++) {
  672. set[symbols[i]] = 1;
  673. }
  674. }
  675. static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue)
  676. {
  677. size_t i;
  678. for (i = 0; i < len; i++) {
  679. if (symbols[i] > maxSymbolValue || !set[symbols[i]]) {
  680. return 0;
  681. }
  682. }
  683. return 1;
  684. }
  685. static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr,
  686. size_t nbSeq)
  687. {
  688. /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */
  689. unsigned count[MaxSeq+1];
  690. S16 norm[MaxSeq+1];
  691. FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable;
  692. FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable;
  693. FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable;
  694. U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */
  695. const seqDef* const sequences = seqStorePtr->sequencesStart;
  696. const BYTE* const ofCodeTable = seqStorePtr->ofCode;
  697. const BYTE* const llCodeTable = seqStorePtr->llCode;
  698. const BYTE* const mlCodeTable = seqStorePtr->mlCode;
  699. BYTE* const oend = (BYTE*)frame->dataEnd;
  700. BYTE* op = (BYTE*)frame->data;
  701. BYTE* seqHead;
  702. BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)];
  703. /* literals compressing block removed so that can be done separately */
  704. /* Sequences Header */
  705. if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
  706. if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
  707. else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
  708. else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
  709. if (nbSeq==0) {
  710. frame->data = op;
  711. return 0;
  712. }
  713. /* seqHead : flags for FSE encoding type */
  714. seqHead = op++;
  715. /* convert length/distances into codes */
  716. ZSTD_seqToCodes(seqStorePtr);
  717. /* CTable for Literal Lengths */
  718. { unsigned max = MaxLL;
  719. size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
  720. assert(!HIST_isError(mostFrequent));
  721. if (frame->stats.fseInit && !(RAND(seed) & 3) &&
  722. isSymbolSubset(llCodeTable, nbSeq,
  723. frame->stats.litlengthSymbolSet, 35)) {
  724. /* maybe do repeat mode if we're allowed to */
  725. LLtype = set_repeat;
  726. } else if (mostFrequent == nbSeq) {
  727. /* do RLE if we have the chance */
  728. *op++ = llCodeTable[0];
  729. FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
  730. LLtype = set_rle;
  731. } else if (!(RAND(seed) & 3)) {
  732. /* maybe use the default distribution */
  733. CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)));
  734. LLtype = set_basic;
  735. } else {
  736. /* fall back on a full table */
  737. size_t nbSeq_1 = nbSeq;
  738. const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
  739. if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
  740. FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
  741. { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
  742. if (FSE_isError(NCountSize)) return ERROR(GENERIC);
  743. op += NCountSize; }
  744. CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)));
  745. LLtype = set_compressed;
  746. } }
  747. /* CTable for Offsets */
  748. /* see Literal Lengths for descriptions of mode choices */
  749. { unsigned max = MaxOff;
  750. size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
  751. assert(!HIST_isError(mostFrequent));
  752. if (frame->stats.fseInit && !(RAND(seed) & 3) &&
  753. isSymbolSubset(ofCodeTable, nbSeq,
  754. frame->stats.offsetSymbolSet, 28)) {
  755. Offtype = set_repeat;
  756. } else if (mostFrequent == nbSeq) {
  757. *op++ = ofCodeTable[0];
  758. FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
  759. Offtype = set_rle;
  760. } else if (!(RAND(seed) & 3)) {
  761. FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
  762. Offtype = set_basic;
  763. } else {
  764. size_t nbSeq_1 = nbSeq;
  765. const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
  766. if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
  767. FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
  768. { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
  769. if (FSE_isError(NCountSize)) return ERROR(GENERIC);
  770. op += NCountSize; }
  771. FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
  772. Offtype = set_compressed;
  773. } }
  774. /* CTable for MatchLengths */
  775. /* see Literal Lengths for descriptions of mode choices */
  776. { unsigned max = MaxML;
  777. size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
  778. assert(!HIST_isError(mostFrequent));
  779. if (frame->stats.fseInit && !(RAND(seed) & 3) &&
  780. isSymbolSubset(mlCodeTable, nbSeq,
  781. frame->stats.matchlengthSymbolSet, 52)) {
  782. MLtype = set_repeat;
  783. } else if (mostFrequent == nbSeq) {
  784. *op++ = *mlCodeTable;
  785. FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
  786. MLtype = set_rle;
  787. } else if (!(RAND(seed) & 3)) {
  788. /* sometimes do default distribution */
  789. FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
  790. MLtype = set_basic;
  791. } else {
  792. /* fall back on table */
  793. size_t nbSeq_1 = nbSeq;
  794. const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
  795. if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
  796. FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
  797. { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
  798. if (FSE_isError(NCountSize)) return ERROR(GENERIC);
  799. op += NCountSize; }
  800. FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
  801. MLtype = set_compressed;
  802. } }
  803. frame->stats.fseInit = 1;
  804. initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35);
  805. initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28);
  806. initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52);
  807. DISPLAYLEVEL(5, " LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype);
  808. *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
  809. /* Encoding Sequences */
  810. { BIT_CStream_t blockStream;
  811. FSE_CState_t stateMatchLength;
  812. FSE_CState_t stateOffsetBits;
  813. FSE_CState_t stateLitLength;
  814. RETURN_ERROR_IF(
  815. ERR_isError(BIT_initCStream(&blockStream, op, oend-op)),
  816. dstSize_tooSmall, "not enough space remaining");
  817. /* first symbols */
  818. FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
  819. FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]);
  820. FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]);
  821. BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
  822. if (MEM_32bits()) BIT_flushBits(&blockStream);
  823. BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
  824. if (MEM_32bits()) BIT_flushBits(&blockStream);
  825. BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
  826. BIT_flushBits(&blockStream);
  827. { size_t n;
  828. for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */
  829. BYTE const llCode = llCodeTable[n];
  830. BYTE const ofCode = ofCodeTable[n];
  831. BYTE const mlCode = mlCodeTable[n];
  832. U32 const llBits = LL_bits[llCode];
  833. U32 const ofBits = ofCode; /* 32b*/ /* 64b*/
  834. U32 const mlBits = ML_bits[mlCode];
  835. /* (7)*/ /* (7)*/
  836. FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
  837. FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
  838. if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
  839. FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
  840. if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
  841. BIT_flushBits(&blockStream); /* (7)*/
  842. BIT_addBits(&blockStream, sequences[n].litLength, llBits);
  843. if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
  844. BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
  845. if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
  846. BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */
  847. BIT_flushBits(&blockStream); /* (7)*/
  848. } }
  849. FSE_flushCState(&blockStream, &stateMatchLength);
  850. FSE_flushCState(&blockStream, &stateOffsetBits);
  851. FSE_flushCState(&blockStream, &stateLitLength);
  852. { size_t const streamSize = BIT_closeCStream(&blockStream);
  853. if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */
  854. op += streamSize;
  855. } }
  856. frame->data = op;
  857. return 0;
  858. }
  859. static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize,
  860. size_t literalsSize, dictInfo info)
  861. {
  862. seqStore_t seqStore;
  863. size_t numSequences;
  864. initSeqStore(&seqStore);
  865. /* randomly generate sequences */
  866. numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info);
  867. /* write them out to the frame data */
  868. CHECKERR(writeSequences(seed, frame, &seqStore, numSequences));
  869. return numSequences;
  870. }
  871. static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info)
  872. {
  873. BYTE* const blockStart = (BYTE*)frame->data;
  874. size_t literalsSize;
  875. size_t nbSeq;
  876. DISPLAYLEVEL(4, " compressed block:\n");
  877. literalsSize = writeLiteralsBlock(seed, frame, contentSize);
  878. DISPLAYLEVEL(4, " literals size: %u\n", (unsigned)literalsSize);
  879. nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info);
  880. DISPLAYLEVEL(4, " number of sequences: %u\n", (unsigned)nbSeq);
  881. return (BYTE*)frame->data - blockStart;
  882. }
  883. static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
  884. int lastBlock, dictInfo info)
  885. {
  886. int const blockTypeDesc = RAND(seed) % 8;
  887. size_t blockSize;
  888. int blockType;
  889. BYTE *const header = (BYTE*)frame->data;
  890. BYTE *op = header + 3;
  891. DISPLAYLEVEL(4, " block:\n");
  892. DISPLAYLEVEL(4, " block content size: %u\n", (unsigned)contentSize);
  893. DISPLAYLEVEL(4, " last block: %s\n", lastBlock ? "yes" : "no");
  894. if (blockTypeDesc == 0) {
  895. /* Raw data frame */
  896. RAND_buffer(seed, frame->src, contentSize);
  897. memcpy(op, frame->src, contentSize);
  898. op += contentSize;
  899. blockType = 0;
  900. blockSize = contentSize;
  901. } else if (blockTypeDesc == 1 && frame->header.contentSize > 0) {
  902. /* RLE (Don't create RLE block if frame content is 0 since block size of 1 may exceed max block size)*/
  903. BYTE const symbol = RAND(seed) & 0xff;
  904. op[0] = symbol;
  905. memset(frame->src, symbol, contentSize);
  906. op++;
  907. blockType = 1;
  908. blockSize = contentSize;
  909. } else {
  910. /* compressed, most common */
  911. size_t compressedSize;
  912. blockType = 2;
  913. frame->oldStats = frame->stats;
  914. frame->data = op;
  915. compressedSize = writeCompressedBlock(seed, frame, contentSize, info);
  916. if (compressedSize >= contentSize) { /* compressed block must be strictly smaller than uncompressed one */
  917. blockType = 0;
  918. memcpy(op, frame->src, contentSize);
  919. op += contentSize;
  920. blockSize = contentSize; /* fall back on raw block if data doesn't
  921. compress */
  922. frame->stats = frame->oldStats; /* don't update the stats */
  923. } else {
  924. op += compressedSize;
  925. blockSize = compressedSize;
  926. }
  927. }
  928. frame->src = (BYTE*)frame->src + contentSize;
  929. DISPLAYLEVEL(4, " block type: %s\n", BLOCK_TYPES[blockType]);
  930. DISPLAYLEVEL(4, " block size field: %u\n", (unsigned)blockSize);
  931. header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff);
  932. MEM_writeLE16(header + 1, (U16) (blockSize >> 5));
  933. frame->data = op;
  934. }
  935. static void writeBlocks(U32* seed, frame_t* frame, dictInfo info)
  936. {
  937. size_t contentLeft = frame->header.contentSize;
  938. size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
  939. while (1) {
  940. /* 1 in 4 chance of ending frame */
  941. int const lastBlock = contentLeft > maxBlockSize ? 0 : !(RAND(seed) & 3);
  942. size_t blockContentSize;
  943. if (lastBlock) {
  944. blockContentSize = contentLeft;
  945. } else {
  946. if (contentLeft > 0 && (RAND(seed) & 7)) {
  947. /* some variable size block */
  948. blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1);
  949. } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) {
  950. /* some full size block */
  951. blockContentSize = maxBlockSize;
  952. } else {
  953. /* some empty block */
  954. blockContentSize = 0;
  955. }
  956. }
  957. writeBlock(seed, frame, blockContentSize, lastBlock, info);
  958. contentLeft -= blockContentSize;
  959. if (lastBlock) break;
  960. }
  961. }
  962. static void writeChecksum(frame_t* frame)
  963. {
  964. /* write checksum so implementations can verify their output */
  965. U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0);
  966. DISPLAYLEVEL(3, " checksum: %08x\n", (unsigned)digest);
  967. MEM_writeLE32(frame->data, (U32)digest);
  968. frame->data = (BYTE*)frame->data + 4;
  969. }
  /* outputBuffer() :
   * Writes `size` bytes from `buf` into the file at `path` (created/truncated,
   * binary mode), or to stdout when `path` is NULL.
   * Any failure (open, write, or close of the file) prints a diagnostic on
   * stderr and terminates the program with exit(1).
   * The file is closed before returning; stdout is left open. */
  static void outputBuffer(const void* buf, size_t size, const char* const path)
  {
      const unsigned char* const ip = (const unsigned char*)buf;
      /* never hand a NULL pointer to "%s" when reporting an error on stdout */
      const char* const name = path ? path : "(stdout)";
      FILE* const out = path ? fopen(path, "wb") : stdout;
      size_t written = 0;

      if (!out) {
          fprintf(stderr, "Failed to open file at %s: ", name);
          perror(NULL);
          exit(1);
      }

      while (written < size) {
          size_t const n = fwrite(ip + written, 1, size - written, out);
          written += n;
          /* a short write without progress is also a failure : don't spin forever */
          if (n == 0 || ferror(out)) {
              fprintf(stderr, "Failed to write to file at %s: ", name);
              perror(NULL);
              exit(1);
          }
      }

      /* fclose() flushes buffered data : a failure here means data was lost */
      if (path && fclose(out) != 0) {
          fprintf(stderr, "Failed to write to file at %s: ", name);
          perror(NULL);
          exit(1);
      }
  }
  1000. static void initFrame(frame_t* fr)
  1001. {
  1002. memset(fr, 0, sizeof(*fr));
  1003. fr->data = fr->dataStart = FRAME_BUFFER;
  1004. fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER);
  1005. fr->src = fr->srcStart = CONTENT_BUFFER;
  1006. fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER);
  1007. /* init repeat codes */
  1008. fr->stats.rep[0] = 1;
  1009. fr->stats.rep[1] = 4;
  1010. fr->stats.rep[2] = 8;
  1011. }
  1012. /**
  1013. * Generated a single zstd compressed block with no block/frame header.
  1014. * Returns the final seed.
  1015. */
  1016. static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info)
  1017. {
  1018. size_t blockContentSize;
  1019. int blockWritten = 0;
  1020. BYTE* op;
  1021. DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed);
  1022. initFrame(frame);
  1023. op = (BYTE*)frame->data;
  1024. while (!blockWritten) {
  1025. size_t cSize;
  1026. /* generate window size */
  1027. { int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10);
  1028. int const mantissa = RAND(&seed) % 8;
  1029. frame->header.windowSize = (1U << (exponent + 10));
  1030. frame->header.windowSize += (frame->header.windowSize / 8) * mantissa;
  1031. }
  1032. /* generate content size */
  1033. { size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
  1034. if (RAND(&seed) & 15) {
  1035. /* some full size blocks */
  1036. blockContentSize = maxBlockSize;
  1037. } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) {
  1038. /* some small blocks <= 128 bytes*/
  1039. blockContentSize = RAND(&seed) % (1U << 7);
  1040. } else {
  1041. /* some variable size blocks */
  1042. blockContentSize = RAND(&seed) % maxBlockSize;
  1043. }
  1044. }
  1045. /* try generating a compressed block */
  1046. frame->oldStats = frame->stats;
  1047. frame->data = op;
  1048. cSize = writeCompressedBlock(&seed, frame, blockContentSize, info);
  1049. if (cSize >= blockContentSize) { /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */
  1050. /* data doesn't compress -- try again */
  1051. frame->stats = frame->oldStats; /* don't update the stats */
  1052. DISPLAYLEVEL(5, " can't compress block : try again \n");
  1053. } else {
  1054. blockWritten = 1;
  1055. DISPLAYLEVEL(4, " block size: %u \n", (unsigned)cSize);
  1056. frame->src = (BYTE*)frame->src + blockContentSize;
  1057. }
  1058. }
  1059. return seed;
  1060. }
  1061. /* Return the final seed */
  1062. static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
  1063. {
  1064. /* generate a complete frame */
  1065. DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed);
  1066. initFrame(fr);
  1067. writeFrameHeader(&seed, fr, info);
  1068. writeBlocks(&seed, fr, info);
  1069. writeChecksum(fr);
  1070. return seed;
  1071. }
  1072. /*_*******************************************************
  1073. * Dictionary Helper Functions
  1074. *********************************************************/
  1075. /* returns 0 if successful, otherwise returns 1 upon error */
  1076. static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict)
  1077. {
  1078. /* allocate space for samples */
  1079. int ret = 0;
  1080. unsigned const numSamples = 4;
  1081. size_t sampleSizes[4];
  1082. BYTE* const samples = malloc(5000*sizeof(BYTE));
  1083. if (samples == NULL) {
  1084. DISPLAY("Error: could not allocate space for samples\n");
  1085. return 1;
  1086. }
  1087. /* generate samples */
  1088. { unsigned literalValue = 1;
  1089. unsigned samplesPos = 0;
  1090. size_t currSize = 1;
  1091. while (literalValue <= 4) {
  1092. sampleSizes[literalValue - 1] = currSize;
  1093. { size_t k;
  1094. for (k = 0; k < currSize; k++) {
  1095. *(samples + (samplesPos++)) = (BYTE)literalValue;
  1096. } }
  1097. literalValue++;
  1098. currSize *= 16;
  1099. } }
  1100. { size_t dictWriteSize = 0;
  1101. ZDICT_params_t zdictParams;
  1102. size_t const headerSize = MAX(dictSize/4, 256);
  1103. size_t const dictContentSize = dictSize - headerSize;
  1104. BYTE* const dictContent = fullDict + headerSize;
  1105. if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) {
  1106. DISPLAY("Error: dictionary size is too small\n");
  1107. ret = 1;
  1108. goto exitGenRandomDict;
  1109. }
  1110. /* init dictionary params */
  1111. memset(&zdictParams, 0, sizeof(zdictParams));
  1112. zdictParams.dictID = dictID;
  1113. zdictParams.notificationLevel = 1;
  1114. /* fill in dictionary content */
  1115. RAND_buffer(&seed, (void*)dictContent, dictContentSize);
  1116. /* finalize dictionary with random samples */
  1117. dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
  1118. dictContent, dictContentSize,
  1119. samples, sampleSizes, numSamples,
  1120. zdictParams);
  1121. if (ZDICT_isError(dictWriteSize)) {
  1122. DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize));
  1123. ret = 1;
  1124. }
  1125. }
  1126. exitGenRandomDict:
  1127. free(samples);
  1128. return ret;
  1129. }
  1130. static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){
  1131. /* allocate space statically */
  1132. dictInfo dictOp;
  1133. memset(&dictOp, 0, sizeof(dictOp));
  1134. dictOp.useDict = useDict;
  1135. dictOp.dictContentSize = dictContentSize;
  1136. dictOp.dictContent = dictContent;
  1137. dictOp.dictID = dictID;
  1138. return dictOp;
  1139. }
  1140. /*-*******************************************************
  1141. * Test Mode
  1142. *********************************************************/
  /* Destination buffer for every decoder run by the test-mode functions below
   * (simple, streaming, dictionary and raw-block decoding). It is shared and
   * never cleared between tests, so it may still hold a previous test's output. */
  1143. BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE];
  1144. static size_t testDecodeSimple(frame_t* fr)
  1145. {
  1146. /* test decoding the generated data with the simple API */
  1147. size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
  1148. fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
  1149. if (ZSTD_isError(ret)) return ret;
  1150. if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
  1151. (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
  1152. return ERROR(corruption_detected);
  1153. }
  1154. return ret;
  1155. }
  1156. static size_t testDecodeStreaming(frame_t* fr)
  1157. {
  1158. /* test decoding the generated data with the streaming API */
  1159. ZSTD_DStream* zd = ZSTD_createDStream();
  1160. ZSTD_inBuffer in;
  1161. ZSTD_outBuffer out;
  1162. size_t ret;
  1163. if (!zd) return ERROR(memory_allocation);
  1164. in.src = fr->dataStart;
  1165. in.pos = 0;
  1166. in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart;
  1167. out.dst = DECOMPRESSED_BUFFER;
  1168. out.pos = 0;
  1169. out.size = ZSTD_DStreamOutSize();
  1170. ZSTD_initDStream(zd);
  1171. while (1) {
  1172. ret = ZSTD_decompressStream(zd, &out, &in);
  1173. if (ZSTD_isError(ret)) goto cleanup; /* error */
  1174. if (ret == 0) break; /* frame is done */
  1175. /* force decoding to be done in chunks */
  1176. out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size);
  1177. }
  1178. ret = out.pos;
  1179. if (memcmp(out.dst, fr->srcStart, out.pos) != 0) {
  1180. return ERROR(corruption_detected);
  1181. }
  1182. cleanup:
  1183. ZSTD_freeDStream(zd);
  1184. return ret;
  1185. }
  1186. static size_t testDecodeWithDict(U32 seed, genType_e genType)
  1187. {
  1188. /* create variables */
  1189. size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
  1190. U32 const dictID = RAND(&seed);
  1191. size_t errorDetected = 0;
  1192. BYTE* const fullDict = malloc(dictSize);
  1193. if (fullDict == NULL) {
  1194. return ERROR(GENERIC);
  1195. }
  1196. /* generate random dictionary */
  1197. if (genRandomDict(dictID, seed, dictSize, fullDict)) { /* return 0 on success */
  1198. errorDetected = ERROR(GENERIC);
  1199. goto dictTestCleanup;
  1200. }
  1201. { frame_t fr;
  1202. dictInfo info;
  1203. ZSTD_DCtx* const dctx = ZSTD_createDCtx();
  1204. size_t ret;
  1205. /* get dict info */
  1206. { size_t const headerSize = MAX(dictSize/4, 256);
  1207. size_t const dictContentSize = dictSize-headerSize;
  1208. BYTE* const dictContent = fullDict+headerSize;
  1209. info = initDictInfo(1, dictContentSize, dictContent, dictID);
  1210. }
  1211. /* manually decompress and check difference */
  1212. if (genType == gt_frame) {
  1213. /* Test frame */
  1214. generateFrame(seed, &fr, info);
  1215. ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
  1216. fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
  1217. fullDict, dictSize);
  1218. } else {
  1219. /* Test block */
  1220. generateCompressedBlock(seed, &fr, info);
  1221. ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize);
  1222. if (ZSTD_isError(ret)) {
  1223. errorDetected = ret;
  1224. ZSTD_freeDCtx(dctx);
  1225. goto dictTestCleanup;
  1226. }
  1227. ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
  1228. fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart);
  1229. }
  1230. ZSTD_freeDCtx(dctx);
  1231. if (ZSTD_isError(ret)) {
  1232. errorDetected = ret;
  1233. goto dictTestCleanup;
  1234. }
  1235. if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
  1236. errorDetected = ERROR(corruption_detected);
  1237. goto dictTestCleanup;
  1238. }
  1239. }
  1240. dictTestCleanup:
  1241. free(fullDict);
  1242. return errorDetected;
  1243. }
  1244. static size_t testDecodeRawBlock(frame_t* fr)
  1245. {
  1246. ZSTD_DCtx* dctx = ZSTD_createDCtx();
  1247. size_t ret = ZSTD_decompressBegin(dctx);
  1248. if (ZSTD_isError(ret)) return ret;
  1249. ret = ZSTD_decompressBlock(
  1250. dctx,
  1251. DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
  1252. fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
  1253. ZSTD_freeDCtx(dctx);
  1254. if (ZSTD_isError(ret)) return ret;
  1255. if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
  1256. (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
  1257. return ERROR(corruption_detected);
  1258. }
  1259. return ret;
  1260. }
  1261. static int runBlockTest(U32* seed)
  1262. {
  1263. frame_t fr;
  1264. U32 const seedCopy = *seed;
  1265. { dictInfo const info = initDictInfo(0, 0, NULL, 0);
  1266. *seed = generateCompressedBlock(*seed, &fr, info);
  1267. }
  1268. { size_t const r = testDecodeRawBlock(&fr);
  1269. if (ZSTD_isError(r)) {
  1270. DISPLAY("Error in block mode on test seed %u: %s\n",
  1271. (unsigned)seedCopy, ZSTD_getErrorName(r));
  1272. return 1;
  1273. }
  1274. }
  1275. { size_t const r = testDecodeWithDict(*seed, gt_block);
  1276. if (ZSTD_isError(r)) {
  1277. DISPLAY("Error in block mode with dictionary on test seed %u: %s\n",
  1278. (unsigned)seedCopy, ZSTD_getErrorName(r));
  1279. return 1;
  1280. }
  1281. }
  1282. return 0;
  1283. }
  1284. static int runFrameTest(U32* seed)
  1285. {
  1286. frame_t fr;
  1287. U32 const seedCopy = *seed;
  1288. { dictInfo const info = initDictInfo(0, 0, NULL, 0);
  1289. *seed = generateFrame(*seed, &fr, info);
  1290. }
  1291. { size_t const r = testDecodeSimple(&fr);
  1292. if (ZSTD_isError(r)) {
  1293. DISPLAY("Error in simple mode on test seed %u: %s\n",
  1294. (unsigned)seedCopy, ZSTD_getErrorName(r));
  1295. return 1;
  1296. }
  1297. }
  1298. { size_t const r = testDecodeStreaming(&fr);
  1299. if (ZSTD_isError(r)) {
  1300. DISPLAY("Error in streaming mode on test seed %u: %s\n",
  1301. (unsigned)seedCopy, ZSTD_getErrorName(r));
  1302. return 1;
  1303. }
  1304. }
  1305. { size_t const r = testDecodeWithDict(*seed, gt_frame); /* avoid big dictionaries */
  1306. if (ZSTD_isError(r)) {
  1307. DISPLAY("Error in dictionary mode on test seed %u: %s\n",
  1308. (unsigned)seedCopy, ZSTD_getErrorName(r));
  1309. return 1;
  1310. }
  1311. }
  1312. return 0;
  1313. }
  1314. static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS,
  1315. genType_e genType)
  1316. {
  1317. unsigned fnum;
  1318. UTIL_time_t const startClock = UTIL_getTime();
  1319. U64 const maxClockSpan = testDurationS * SEC_TO_MICRO;
  1320. if (numFiles == 0 && !testDurationS) numFiles = 1;
  1321. DISPLAY("seed: %u\n", (unsigned)seed);
  1322. for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) {
  1323. if (fnum < numFiles)
  1324. DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
  1325. else
  1326. DISPLAYUPDATE("\r%u ", fnum);
  1327. { int const ret = (genType == gt_frame) ?
  1328. runFrameTest(&seed) :
  1329. runBlockTest(&seed);
  1330. if (ret) return ret;
  1331. }
  1332. }
  1333. DISPLAY("\r%u tests completed: ", fnum);
  1334. DISPLAY("OK\n");
  1335. return 0;
  1336. }
  1337. /*-*******************************************************
  1338. * File I/O
  1339. *********************************************************/
  1340. static int generateFile(U32 seed, const char* const path,
  1341. const char* const origPath, genType_e genType)
  1342. {
  1343. frame_t fr;
  1344. DISPLAY("seed: %u\n", (unsigned)seed);
  1345. { dictInfo const info = initDictInfo(0, 0, NULL, 0);
  1346. if (genType == gt_frame) {
  1347. generateFrame(seed, &fr, info);
  1348. } else {
  1349. generateCompressedBlock(seed, &fr, info);
  1350. }
  1351. }
  1352. outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
  1353. if (origPath) {
  1354. outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
  1355. }
  1356. return 0;
  1357. }
  1358. static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
  1359. const char* const origPath, genType_e genType)
  1360. {
  1361. char outPath[MAX_PATH];
  1362. unsigned fnum;
  1363. DISPLAY("seed: %u\n", (unsigned)seed);
  1364. for (fnum = 0; fnum < numFiles; fnum++) {
  1365. frame_t fr;
  1366. DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
  1367. { dictInfo const info = initDictInfo(0, 0, NULL, 0);
  1368. if (genType == gt_frame) {
  1369. seed = generateFrame(seed, &fr, info);
  1370. } else {
  1371. seed = generateCompressedBlock(seed, &fr, info);
  1372. }
  1373. }
  1374. if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
  1375. DISPLAY("Error: path too long\n");
  1376. return 1;
  1377. }
  1378. outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
  1379. if (origPath) {
  1380. if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
  1381. DISPLAY("Error: path too long\n");
  1382. return 1;
  1383. }
  1384. outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
  1385. }
  1386. }
  1387. DISPLAY("\r%u/%u \n", fnum, numFiles);
  1388. return 0;
  1389. }
  1390. static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
  1391. const char* const origPath, const size_t dictSize,
  1392. genType_e genType)
  1393. {
  1394. char outPath[MAX_PATH];
  1395. BYTE* fullDict;
  1396. U32 const dictID = RAND(&seed);
  1397. int errorDetected = 0;
  1398. if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
  1399. DISPLAY("Error: path too long\n");
  1400. return 1;
  1401. }
  1402. /* allocate space for the dictionary */
  1403. fullDict = malloc(dictSize);
  1404. if (fullDict == NULL) {
  1405. DISPLAY("Error: could not allocate space for full dictionary.\n");
  1406. return 1;
  1407. }
  1408. /* randomly generate the dictionary */
  1409. { int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
  1410. if (ret != 0) {
  1411. errorDetected = ret;
  1412. goto dictCleanup;
  1413. }
  1414. }
  1415. /* write out dictionary */
  1416. if (numFiles != 0) {
  1417. if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
  1418. DISPLAY("Error: dictionary path too long\n");
  1419. errorDetected = 1;
  1420. goto dictCleanup;
  1421. }
  1422. outputBuffer(fullDict, dictSize, outPath);
  1423. }
  1424. else {
  1425. outputBuffer(fullDict, dictSize, "dictionary");
  1426. }
  1427. /* generate random compressed/decompressed files */
  1428. { unsigned fnum;
  1429. for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) {
  1430. frame_t fr;
  1431. DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
  1432. {
  1433. size_t const headerSize = MAX(dictSize/4, 256);
  1434. size_t const dictContentSize = dictSize-headerSize;
  1435. BYTE* const dictContent = fullDict+headerSize;
  1436. dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
  1437. if (genType == gt_frame) {
  1438. seed = generateFrame(seed, &fr, info);
  1439. } else {
  1440. seed = generateCompressedBlock(seed, &fr, info);
  1441. }
  1442. }
  1443. if (numFiles != 0) {
  1444. if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
  1445. DISPLAY("Error: path too long\n");
  1446. errorDetected = 1;
  1447. goto dictCleanup;
  1448. }
  1449. outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
  1450. if (origPath) {
  1451. if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
  1452. DISPLAY("Error: path too long\n");
  1453. errorDetected = 1;
  1454. goto dictCleanup;
  1455. }
  1456. outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
  1457. }
  1458. }
  1459. else {
  1460. outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
  1461. if (origPath) {
  1462. outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
  1463. }
  1464. }
  1465. }
  1466. }
  1467. dictCleanup:
  1468. free(fullDict);
  1469. return errorDetected;
  1470. }
  1471. /*_*******************************************************
  1472. * Command line
  1473. *********************************************************/
  1474. static U32 makeSeed(void)
  1475. {
  1476. U32 t = (U32) time(NULL);
  1477. return XXH32(&t, sizeof(t), 0) % 65536;
  1478. }
  1479. static unsigned readInt(const char** argument)
  1480. {
  1481. unsigned val = 0;
  1482. while ((**argument>='0') && (**argument<='9')) {
  1483. val *= 10;
  1484. val += **argument - '0';
  1485. (*argument)++;
  1486. }
  1487. return val;
  1488. }
  1489. static void usage(const char* programName)
  1490. {
  1491. DISPLAY( "Usage :\n");
  1492. DISPLAY( " %s [args]\n", programName);
  1493. DISPLAY( "\n");
  1494. DISPLAY( "Arguments :\n");
  1495. DISPLAY( " -p<path> : select output path (default:stdout)\n");
  1496. DISPLAY( " in multiple files mode this should be a directory\n");
  1497. DISPLAY( " -o<path> : select path to output original file (default:no output)\n");
  1498. DISPLAY( " in multiple files mode this should be a directory\n");
  1499. DISPLAY( " -s# : select seed (default:random based on time)\n");
  1500. DISPLAY( " -n# : number of files to generate (default:1)\n");
  1501. DISPLAY( " -t : activate test mode (test files against libzstd instead of outputting them)\n");
  1502. DISPLAY( " -T# : length of time to run tests for\n");
  1503. DISPLAY( " -v : increase verbosity level (default:0, max:7)\n");
  1504. DISPLAY( " -h/H : display help/long help and exit\n");
  1505. }
  1506. static void advancedUsage(const char* programName)
  1507. {
  1508. usage(programName);
  1509. DISPLAY( "\n");
  1510. DISPLAY( "Advanced arguments :\n");
  1511. DISPLAY( " --content-size : always include the content size in the frame header\n");
  1512. DISPLAY( " --use-dict=# : include a dictionary used to decompress the corpus\n");
  1513. DISPLAY( " --gen-blocks : generate raw compressed blocks without block/frame headers\n");
  1514. DISPLAY( " --max-block-size-log=# : max block size log, must be in range [2, 17]\n");
  1515. DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n");
  1516. DISPLAY( " (this is ignored with gen-blocks)\n");
  1517. }
  1518. /*! readU32FromChar() :
  1519. @return : unsigned integer value read from input in `char` format
  1520. allows and interprets K, KB, KiB, M, MB and MiB suffix.
  1521. Will also modify `*stringPtr`, advancing it to position where it stopped reading.
  1522. Note : function result can overflow if digit string > MAX_UINT */
  1523. static unsigned readU32FromChar(const char** stringPtr)
  1524. {
  1525. unsigned result = 0;
  1526. while ((**stringPtr >='0') && (**stringPtr <='9'))
  1527. result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
  1528. if ((**stringPtr=='K') || (**stringPtr=='M')) {
  1529. result <<= 10;
  1530. if (**stringPtr=='M') result <<= 10;
  1531. (*stringPtr)++ ;
  1532. if (**stringPtr=='i') (*stringPtr)++;
  1533. if (**stringPtr=='B') (*stringPtr)++;
  1534. }
  1535. return result;
  1536. }
  1537. /** longCommandWArg() :
  1538. * check if *stringPtr is the same as longCommand.
  1539. * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
  1540. * @return 0 and doesn't modify *stringPtr otherwise.
  1541. */
  1542. static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
  1543. {
  1544. size_t const comSize = strlen(longCommand);
  1545. int const result = !strncmp(*stringPtr, longCommand, comSize);
  1546. if (result) *stringPtr += comSize;
  1547. return result;
  1548. }
  1549. int main(int argc, char** argv)
  1550. {
  1551. U32 seed = 0;
  1552. int seedset = 0;
  1553. unsigned numFiles = 0;
  1554. unsigned testDuration = 0;
  1555. int testMode = 0;
  1556. const char* path = NULL;
  1557. const char* origPath = NULL;
  1558. int useDict = 0;
  1559. unsigned dictSize = (10 << 10); /* 10 kB default */
  1560. genType_e genType = gt_frame;
  1561. int argNb;
  1562. /* Check command line */
  1563. for (argNb=1; argNb<argc; argNb++) {
  1564. const char* argument = argv[argNb];
  1565. if(!argument) continue; /* Protection if argument empty */
  1566. /* Handle commands. Aggregated commands are allowed */
  1567. if (argument[0]=='-') {
  1568. argument++;
  1569. while (*argument!=0) {
  1570. switch(*argument)
  1571. {
  1572. case 'h':
  1573. usage(argv[0]);
  1574. return 0;
  1575. case 'H':
  1576. advancedUsage(argv[0]);
  1577. return 0;
  1578. case 'v':
  1579. argument++;
  1580. g_displayLevel++;
  1581. break;
  1582. case 's':
  1583. argument++;
  1584. seedset=1;
  1585. seed = readInt(&argument);
  1586. break;
  1587. case 'n':
  1588. argument++;
  1589. numFiles = readInt(&argument);
  1590. break;
  1591. case 'T':
  1592. argument++;
  1593. testDuration = readInt(&argument);
  1594. if (*argument == 'm') {
  1595. testDuration *= 60;
  1596. argument++;
  1597. if (*argument == 'n') argument++;
  1598. }
  1599. break;
  1600. case 'o':
  1601. argument++;
  1602. origPath = argument;
  1603. argument += strlen(argument);
  1604. break;
  1605. case 'p':
  1606. argument++;
  1607. path = argument;
  1608. argument += strlen(argument);
  1609. break;
  1610. case 't':
  1611. argument++;
  1612. testMode = 1;
  1613. break;
  1614. case '-':
  1615. argument++;
  1616. if (strcmp(argument, "content-size") == 0) {
  1617. opts.contentSize = 1;
  1618. } else if (longCommandWArg(&argument, "use-dict=")) {
  1619. dictSize = readU32FromChar(&argument);
  1620. useDict = 1;
  1621. } else if (strcmp(argument, "gen-blocks") == 0) {
  1622. genType = gt_block;
  1623. } else if (longCommandWArg(&argument, "max-block-size-log=")) {
  1624. U32 value = readU32FromChar(&argument);
  1625. if (value >= 2 && value <= ZSTD_BLOCKSIZE_MAX) {
  1626. g_maxBlockSize = 1U << value;
  1627. }
  1628. } else if (longCommandWArg(&argument, "max-content-size-log=")) {
  1629. U32 value = readU32FromChar(&argument);
  1630. g_maxDecompressedSizeLog =
  1631. MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
  1632. } else {
  1633. advancedUsage(argv[0]);
  1634. return 1;
  1635. }
  1636. argument += strlen(argument);
  1637. break;
  1638. default:
  1639. usage(argv[0]);
  1640. return 1;
  1641. } } } } /* for (argNb=1; argNb<argc; argNb++) */
  1642. if (!seedset) {
  1643. seed = makeSeed();
  1644. }
  1645. if (testMode) {
  1646. return runTestMode(seed, numFiles, testDuration, genType);
  1647. } else {
  1648. if (testDuration) {
  1649. DISPLAY("Error: -T requires test mode (-t)\n\n");
  1650. usage(argv[0]);
  1651. return 1;
  1652. }
  1653. }
  1654. if (!path) {
  1655. DISPLAY("Error: path is required in file generation mode\n");
  1656. usage(argv[0]);
  1657. return 1;
  1658. }
  1659. if (numFiles == 0 && useDict == 0) {
  1660. return generateFile(seed, path, origPath, genType);
  1661. } else if (useDict == 0){
  1662. return generateCorpus(seed, numFiles, path, origPath, genType);
  1663. } else {
  1664. /* should generate files with a dictionary */
  1665. return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);
  1666. }
  1667. }