lizard_parser_hashchain.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. #define LIZARD_HC_MIN_OFFSET 8
  2. #define LIZARD_HC_LONGOFF_MM 0 /* not used with offsets > 1<<16 */
  3. #define OPTIMAL_ML (int)((ML_MASK_LZ4-1)+MINMATCH)
  4. #define GET_MINMATCH(offset) (MINMATCH)
  5. #if 1
  6. #define LIZARD_HC_HASH_FUNCTION(ip, hashLog) Lizard_hashPtr(ip, hashLog, ctx->params.searchLength)
  7. #else
  8. #define LIZARD_HC_HASH_FUNCTION(ip, hashLog) Lizard_hash5Ptr(ip, hashLog)
  9. #endif
  10. /* Update chains up to ip (excluded) */
  11. FORCE_INLINE void Lizard_Insert (Lizard_stream_t* ctx, const BYTE* ip)
  12. {
  13. U32* const chainTable = ctx->chainTable;
  14. U32* const hashTable = ctx->hashTable;
  15. #if MINMATCH == 3
  16. U32* HashTable3 = ctx->hashTable3;
  17. #endif
  18. const BYTE* const base = ctx->base;
  19. U32 const target = (U32)(ip - base);
  20. U32 idx = ctx->nextToUpdate;
  21. const int hashLog = ctx->params.hashLog;
  22. const U32 contentMask = (1 << ctx->params.contentLog) - 1;
  23. const U32 maxDistance = (1 << ctx->params.windowLog) - 1;
  24. while (idx < target) {
  25. size_t const h = Lizard_hashPtr(base+idx, hashLog, ctx->params.searchLength);
  26. size_t delta = idx - hashTable[h];
  27. if (delta>maxDistance) delta = maxDistance;
  28. DELTANEXT(idx) = (U32)delta;
  29. if ((hashTable[h] >= idx) || (idx >= hashTable[h] + LIZARD_HC_MIN_OFFSET))
  30. hashTable[h] = idx;
  31. #if MINMATCH == 3
  32. HashTable3[Lizard_hash3Ptr(base+idx, ctx->params.hashLog3)] = idx;
  33. #endif
  34. idx++;
  35. }
  36. ctx->nextToUpdate = target;
  37. }
  38. FORCE_INLINE int Lizard_InsertAndFindBestMatch (Lizard_stream_t* ctx, /* Index table will be updated */
  39. const BYTE* ip, const BYTE* const iLimit,
  40. const BYTE** matchpos)
  41. {
  42. U32* const chainTable = ctx->chainTable;
  43. U32* const HashTable = ctx->hashTable;
  44. const BYTE* const base = ctx->base;
  45. const BYTE* const dictBase = ctx->dictBase;
  46. const U32 dictLimit = ctx->dictLimit;
  47. const BYTE* const lowPrefixPtr = base + dictLimit;
  48. const BYTE* const dictEnd = dictBase + dictLimit;
  49. U32 matchIndex, delta;
  50. const BYTE* match;
  51. int nbAttempts=ctx->params.searchNum;
  52. size_t ml=0;
  53. const int hashLog = ctx->params.hashLog;
  54. const U32 contentMask = (1 << ctx->params.contentLog) - 1;
  55. const U32 maxDistance = (1 << ctx->params.windowLog) - 1;
  56. const U32 current = (U32)(ip - base);
  57. const U32 lowLimit = (ctx->lowLimit + maxDistance >= current) ? ctx->lowLimit : current - maxDistance;
  58. /* HC4 match finder */
  59. Lizard_Insert(ctx, ip);
  60. matchIndex = HashTable[LIZARD_HC_HASH_FUNCTION(ip, hashLog)];
  61. while ((matchIndex < current) && (matchIndex >= lowLimit) && (nbAttempts)) {
  62. nbAttempts--;
  63. if (matchIndex >= dictLimit) {
  64. match = base + matchIndex;
  65. #if LIZARD_HC_MIN_OFFSET > 0
  66. if ((U32)(ip - match) >= LIZARD_HC_MIN_OFFSET)
  67. #endif
  68. if (*(match+ml) == *(ip+ml)
  69. && (MEM_read32(match) == MEM_read32(ip)))
  70. {
  71. size_t const mlt = Lizard_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH;
  72. #if LIZARD_HC_LONGOFF_MM > 0
  73. if ((mlt >= LIZARD_HC_LONGOFF_MM) || ((U32)(ip - match) < LIZARD_MAX_16BIT_OFFSET))
  74. #endif
  75. if (mlt > ml) { ml = mlt; *matchpos = match; }
  76. }
  77. } else {
  78. match = dictBase + matchIndex;
  79. #if LIZARD_HC_MIN_OFFSET > 0
  80. if ((U32)(ip - (base + matchIndex)) >= LIZARD_HC_MIN_OFFSET)
  81. #endif
  82. if ((U32)((dictLimit-1) - matchIndex) >= 3) /* intentional overflow */
  83. if (MEM_read32(match) == MEM_read32(ip)) {
  84. size_t mlt = Lizard_count_2segments(ip+MINMATCH, match+MINMATCH, iLimit, dictEnd, lowPrefixPtr) + MINMATCH;
  85. #if LIZARD_HC_LONGOFF_MM > 0
  86. if ((mlt >= LIZARD_HC_LONGOFF_MM) || ((U32)(ip - (base + matchIndex)) < LIZARD_MAX_16BIT_OFFSET))
  87. #endif
  88. if (mlt > ml) { ml = mlt; *matchpos = base + matchIndex; } /* virtual matchpos */
  89. }
  90. }
  91. delta = DELTANEXT(matchIndex);
  92. if (delta > matchIndex) break;
  93. matchIndex -= delta;
  94. }
  95. return (int)ml;
  96. }
  97. FORCE_INLINE int Lizard_InsertAndGetWiderMatch (
  98. Lizard_stream_t* ctx,
  99. const BYTE* const ip,
  100. const BYTE* const iLowLimit,
  101. const BYTE* const iHighLimit,
  102. int longest,
  103. const BYTE** matchpos,
  104. const BYTE** startpos)
  105. {
  106. U32* const chainTable = ctx->chainTable;
  107. U32* const HashTable = ctx->hashTable;
  108. const BYTE* const base = ctx->base;
  109. const U32 dictLimit = ctx->dictLimit;
  110. const BYTE* const lowPrefixPtr = base + dictLimit;
  111. const BYTE* const dictBase = ctx->dictBase;
  112. const BYTE* const dictEnd = dictBase + dictLimit;
  113. U32 matchIndex, delta;
  114. int nbAttempts = ctx->params.searchNum;
  115. int LLdelta = (int)(ip-iLowLimit);
  116. const int hashLog = ctx->params.hashLog;
  117. const U32 contentMask = (1 << ctx->params.contentLog) - 1;
  118. const U32 maxDistance = (1 << ctx->params.windowLog) - 1;
  119. const U32 current = (U32)(ip - base);
  120. const U32 lowLimit = (ctx->lowLimit + maxDistance >= current) ? ctx->lowLimit : current - maxDistance;
  121. /* First Match */
  122. Lizard_Insert(ctx, ip);
  123. matchIndex = HashTable[LIZARD_HC_HASH_FUNCTION(ip, hashLog)];
  124. while ((matchIndex < current) && (matchIndex >= lowLimit) && (nbAttempts)) {
  125. nbAttempts--;
  126. if (matchIndex >= dictLimit) {
  127. const BYTE* match = base + matchIndex;
  128. #if LIZARD_HC_MIN_OFFSET > 0
  129. if ((U32)(ip - match) >= LIZARD_HC_MIN_OFFSET)
  130. #endif
  131. if (*(iLowLimit + longest) == *(match - LLdelta + longest)) {
  132. if (MEM_read32(match) == MEM_read32(ip)) {
  133. int mlt = MINMATCH + Lizard_count(ip+MINMATCH, match+MINMATCH, iHighLimit);
  134. int back = 0;
  135. while ((ip+back > iLowLimit) && (match+back > lowPrefixPtr) && (ip[back-1] == match[back-1])) back--;
  136. mlt -= back;
  137. #if LIZARD_HC_LONGOFF_MM > 0
  138. if ((mlt >= LIZARD_HC_LONGOFF_MM) || ((U32)(ip - match) < LIZARD_MAX_16BIT_OFFSET))
  139. #endif
  140. if (mlt > longest) {
  141. longest = (int)mlt;
  142. *matchpos = match+back;
  143. *startpos = ip+back;
  144. }
  145. }
  146. }
  147. } else {
  148. const BYTE* match = dictBase + matchIndex;
  149. #if LIZARD_HC_MIN_OFFSET > 0
  150. if ((U32)(ip - (base + matchIndex)) >= LIZARD_HC_MIN_OFFSET)
  151. #endif
  152. if ((U32)((dictLimit-1) - matchIndex) >= 3) /* intentional overflow */
  153. if (MEM_read32(match) == MEM_read32(ip)) {
  154. int back=0;
  155. size_t mlt = Lizard_count_2segments(ip+MINMATCH, match+MINMATCH, iHighLimit, dictEnd, lowPrefixPtr) + MINMATCH;
  156. while ((ip+back > iLowLimit) && (matchIndex+back > lowLimit) && (ip[back-1] == match[back-1])) back--;
  157. mlt -= back;
  158. #if LIZARD_HC_LONGOFF_MM > 0
  159. if ((mlt >= LIZARD_HC_LONGOFF_MM) || ((U32)(ip - (base + matchIndex)) < LIZARD_MAX_16BIT_OFFSET))
  160. #endif
  161. if ((int)mlt > longest) { longest = (int)mlt; *matchpos = base + matchIndex + back; *startpos = ip+back; }
  162. }
  163. }
  164. delta = DELTANEXT(matchIndex);
  165. if (delta > matchIndex) break;
  166. matchIndex -= delta;
  167. }
  168. return longest;
  169. }
  170. FORCE_INLINE int Lizard_compress_hashChain (
  171. Lizard_stream_t* const ctx,
  172. const BYTE* ip,
  173. const BYTE* const iend)
  174. {
  175. const BYTE* anchor = ip;
  176. const BYTE* const mflimit = iend - MFLIMIT;
  177. const BYTE* const matchlimit = (iend - LASTLITERALS);
  178. int ml, ml2, ml3, ml0;
  179. const BYTE* ref = NULL;
  180. const BYTE* start2 = NULL;
  181. const BYTE* ref2 = NULL;
  182. const BYTE* start3 = NULL;
  183. const BYTE* ref3 = NULL;
  184. const BYTE* start0;
  185. const BYTE* ref0;
  186. /* init */
  187. ip++;
  188. /* Main Loop */
  189. while (ip < mflimit) {
  190. ml = Lizard_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref));
  191. if (!ml) { ip++; continue; }
  192. /* saved, in case we would skip too much */
  193. start0 = ip;
  194. ref0 = ref;
  195. ml0 = ml;
  196. _Search2:
  197. if (ip+ml < mflimit)
  198. ml2 = Lizard_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2);
  199. else ml2 = ml;
  200. if (ml2 == ml) { /* No better match */
  201. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  202. continue;
  203. }
  204. if (start0 < ip) {
  205. if (start2 < ip + ml0) { /* empirical */
  206. ip = start0;
  207. ref = ref0;
  208. ml = ml0;
  209. }
  210. }
  211. /* Here, start0==ip */
  212. if ((start2 - ip) < 3) { /* First Match too small : removed */
  213. ml = ml2;
  214. ip = start2;
  215. ref =ref2;
  216. goto _Search2;
  217. }
  218. _Search3:
  219. /*
  220. * Currently we have :
  221. * ml2 > ml1, and
  222. * ip1+3 <= ip2 (usually < ip1+ml1)
  223. */
  224. if ((start2 - ip) < OPTIMAL_ML) {
  225. int correction;
  226. int new_ml = ml;
  227. if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML;
  228. if (ip+new_ml > start2 + ml2 - GET_MINMATCH((U32)(start2 - ref2))) {
  229. new_ml = (int)(start2 - ip) + ml2 - GET_MINMATCH((U32)(start2 - ref2));
  230. if (new_ml < GET_MINMATCH((U32)(ip - ref))) { // match2 doesn't fit
  231. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  232. continue;
  233. }
  234. }
  235. correction = new_ml - (int)(start2 - ip);
  236. if (correction > 0) {
  237. start2 += correction;
  238. ref2 += correction;
  239. ml2 -= correction;
  240. }
  241. }
  242. /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */
  243. if (start2 + ml2 < mflimit)
  244. ml3 = Lizard_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3);
  245. else ml3 = ml2;
  246. if (ml3 == ml2) { /* No better match : 2 sequences to encode */
  247. /* ip & ref are known; Now for ml */
  248. if (start2 < ip+ml) ml = (int)(start2 - ip);
  249. /* Now, encode 2 sequences */
  250. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  251. ip = start2;
  252. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml2, ref2)) return 0;
  253. continue;
  254. }
  255. if (start3 < ip+ml+3) { /* Not enough space for match 2 : remove it */
  256. if (start3 >= (ip+ml)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */
  257. if (start2 < ip+ml) {
  258. int correction = (int)(ip+ml - start2);
  259. start2 += correction;
  260. ref2 += correction;
  261. ml2 -= correction;
  262. if (ml2 < GET_MINMATCH((U32)(start2 - ref2))) {
  263. start2 = start3;
  264. ref2 = ref3;
  265. ml2 = ml3;
  266. }
  267. }
  268. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  269. ip = start3;
  270. ref = ref3;
  271. ml = ml3;
  272. start0 = start2;
  273. ref0 = ref2;
  274. ml0 = ml2;
  275. goto _Search2;
  276. }
  277. start2 = start3;
  278. ref2 = ref3;
  279. ml2 = ml3;
  280. goto _Search3;
  281. }
  282. /*
  283. * OK, now we have 3 ascending matches; let's write at least the first one
  284. * ip & ref are known; Now for ml
  285. */
  286. if (start2 < ip+ml) {
  287. if ((start2 - ip) < (int)ML_MASK_LZ4) {
  288. int correction;
  289. if (ml > OPTIMAL_ML) ml = OPTIMAL_ML;
  290. if (ip + ml > start2 + ml2 - GET_MINMATCH((U32)(start2 - ref2))) {
  291. ml = (int)(start2 - ip) + ml2 - GET_MINMATCH((U32)(start2 - ref2));
  292. if (ml < GET_MINMATCH((U32)(ip - ref))) { // match2 doesn't fit, remove it
  293. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  294. ip = start3;
  295. ref = ref3;
  296. ml = ml3;
  297. start0 = start2;
  298. ref0 = ref2;
  299. ml0 = ml2;
  300. goto _Search2;
  301. }
  302. }
  303. correction = ml - (int)(start2 - ip);
  304. if (correction > 0) {
  305. start2 += correction;
  306. ref2 += correction;
  307. ml2 -= correction;
  308. }
  309. } else {
  310. ml = (int)(start2 - ip);
  311. }
  312. }
  313. if (Lizard_encodeSequence_LZ4(ctx, &ip, &anchor, ml, ref)) return 0;
  314. ip = start2;
  315. ref = ref2;
  316. ml = ml2;
  317. start2 = start3;
  318. ref2 = ref3;
  319. ml2 = ml3;
  320. goto _Search3;
  321. }
  322. /* Encode Last Literals */
  323. ip = iend;
  324. if (Lizard_encodeLastLiterals_LZ4(ctx, &ip, &anchor)) goto _output_error;
  325. /* End */
  326. return 1;
  327. _output_error:
  328. return 0;
  329. }