hash.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. /* This file is part of the software similarity tester SIM.
  2. Written by Dick Grune, Vrije Universiteit, Amsterdam.
  3. $Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
  4. */
  5. /* Text is compared by comparing every substring to all substrings
  6. to the right of it; this process is in essence quadratic. However,
  7. only substrings of length at least 'MinRunSize' are of interest,
  8. which gives us the possibility to speed up this process by using
  9. a hash table.
  10. For every position in the text, we construct an index which gives
  11. the next position in the text at which a run of MinRunSize tokens
  12. starts that has the same hash code, as calculated by hash1(). If
  13. there is no such run, the index is 0. These forward references are
  14. kept in the array forward_references[].
  15. To construct this array, we use a hash table last_index[] whose size
  16. is a prime and which is about 8 times smaller than the text array.
  17. The hash table last_index[] is set up such that last_index[i] is the
  18. index of the latest token with hash_code i, or 0 if there is none.
  19. This results in hash chains of an average length of 8. See
  20. MakeForwardReferences().
  21. If there is not enough room for a hash table of the proper size
  22. (which can be considerable) the hashing is not efficient any more.
  23. In that case, the forward reference table is scanned a second time,
  24. eliminating from any chain all references to runs that do not hash to
  25. the same value under a second hash function, hash2(). For the UNIX
  26. manuals this reduced the number of matches from 91.9% to 1.9% (of
  27. which 0.06% was genuine).
  28. */
  29. #include <stdio.h>
  30. #include <malloc.h>
  31. #include "system.par"
  32. #include "debug.par"
  33. #include "sim.h"
  34. #include "error.h"
  35. #include "language.h"
  36. #include "token.h"
  37. #include "tokenarray.h"
  38. #include "options.h"
  39. #include "hash.h"
  40. /* MAIN ENTRIES */
  41. static unsigned int *forward_references; /* to be filled by malloc() */
  42. static int n_forward_references;
  43. static void make_forward_references_hash1(void);
  44. static void make_forward_references_hash2(void);
  45. #ifdef DB_FORW_REF
  46. static void db_forward_references(const char *);
  47. static void make_forward_references_hash3(void);
  48. #endif
  49. void
  50. MakeForwardReferences(void) {
  51. /* Constructs the forward references table.
  52. */
  53. n_forward_references = TextLength();
  54. forward_references =
  55. (unsigned int *)calloc(
  56. n_forward_references, sizeof (unsigned int)
  57. );
  58. if (!forward_references) {
  59. fatal("out of memory");
  60. }
  61. make_forward_references_hash1();
  62. make_forward_references_hash2();
  63. #ifdef DB_FORW_REF
  64. make_forward_references_hash3();
  65. #endif
  66. }
  67. unsigned int
  68. ForwardReference(int i) {
  69. if (i <= 0 || i >= n_forward_references) {
  70. fatal("internal error, bad forward reference");
  71. }
  72. return forward_references[i];
  73. }
  74. void
  75. FreeForwardReferences(void) {
  76. free((char *)forward_references);
  77. }
  78. /* HASHING */
  79. /*
  80. We want a hash function whose time cost does not depend on
  81. MinRunSize, which is a problem since the size of the value
  82. we derive the hash function from IS equal to MinRunSize!
  83. Therefore we base the hash function on a sample of at most 24
  84. tokens from the input string; this works at least as well in
  85. practice. These 24 token values will result in exactly 31
  86. bits under the hashing algorithm used, which avoids an
  87. overflow test. So this 24 bears no relation to the default
  88. run size (although the fit is surprising!)
  89. */
  90. #define N_SAMPLES 24
  91. #define OPERATION ^
  92. /* An alternative algorithm; does not seem to make any difference.
  93. #define N_SAMPLES 23
  94. #define OPERATION +
  95. */
  96. /* Another algorithm; not yet tested
  97. #define N_SAMPLES 24
  98. #define OPERATION + 613 *
  99. */
  100. static unsigned int *last_index;
  101. static unsigned int hash_table_size;
  102. static int sample_pos[N_SAMPLES];
  103. static unsigned int
  104. prime[] = { /* lots of hopefully suitable primes */
  105. 10639,
  106. 21283,
  107. 42571,
  108. 85147,
  109. 170227,
  110. 340451,
  111. 680959,
  112. 1361803,
  113. 2723599,
  114. 5447171,
  115. 10894379,
  116. 21788719,
  117. 43577399,
  118. 87154759,
  119. 174309383,
  120. 348618827,
  121. 697237511,
  122. 1394475011
  123. };
  124. static void
  125. init_hash_table(void) {
  126. register int n;
  127. /* find the ideal hash table size */
  128. n = 0;
  129. while (prime[n] < TextLength()) {
  130. n++;
  131. /* this will always terminate, if prime[] is large enough */
  132. }
  133. /* see if we can allocate that much space, and if not, step down */
  134. last_index = 0;
  135. while (!last_index && n >= 0) {
  136. hash_table_size = prime[n];
  137. last_index = (unsigned int *)
  138. calloc(hash_table_size, sizeof (unsigned int));
  139. n--;
  140. }
  141. if (!last_index) {
  142. fatal("out of memory");
  143. }
  144. /* find sample positions */
  145. for (n = 0; n < N_SAMPLES; n++) {
  146. /* straigh-line approximation; uninituitive as usual */
  147. sample_pos[n] = (
  148. (2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
  149. / (2 * (N_SAMPLES - 1))
  150. );
  151. }
  152. }
  153. static int hash1(const TOKEN *);
  154. static void
  155. make_forward_references_hash1(void) {
  156. register int n;
  157. init_hash_table();
  158. /* set up the forward references using the last_index hash table */
  159. for (n = 0; n < NumberOfTexts; n++) {
  160. register struct text *txt = &Text[n];
  161. register unsigned int j;
  162. for ( /* all pos'ns in txt except the last MinRunSize-1 */
  163. j = txt->tx_start; /* >= 1 */
  164. j + MinRunSize - 1 < txt->tx_limit;
  165. j++
  166. ) {
  167. if (MayBeStartOfRun(TokenArray[j])) {
  168. register int h = hash1(&TokenArray[j]);
  169. if (last_index[h]) {
  170. forward_references[last_index[h]] = j;
  171. }
  172. last_index[h] = j;
  173. }
  174. }
  175. }
  176. free((char *)last_index);
  177. #ifdef DB_FORW_REF
  178. db_forward_references("first hashing");
  179. #endif /* DB_FORW_REF */
  180. }
  181. static int
  182. hash1(const TOKEN *p) {
  183. /* hash1(p) returns the hash code of the MinRunSize
  184. tokens starting at p; caller guarantees that there
  185. are at least MinRunSize tokens.
  186. */
  187. register int32 h_val;
  188. register int n;
  189. h_val = 0;
  190. for (n = 0; n < N_SAMPLES; n++) {
  191. h_val = (h_val << 1) OPERATION TOKEN2int(p[sample_pos[n]]);
  192. #if N_SAMPLES > 24
  193. if (h_val & (1<<31)) {
  194. h_val ^= (1<<31|1);
  195. }
  196. #endif
  197. }
  198. /* just in case somebody tries wrong N_SAMPLES and OPERATION values: */
  199. if (h_val < 0) fatal("corrupt hash algorithm in hash1() in hash.c");
  200. return h_val % hash_table_size;
  201. }
  202. static int hash2(const TOKEN *);
  203. static void
  204. make_forward_references_hash2(void) {
  205. register unsigned int i;
  206. /* do a second hash only if the original hash table was reduced */
  207. /* Meanwhile, the quality of the primary hashing is so bad
  208. that we are virtually forced to always do a second scan.
  209. */
  210. /* Clean out spurious matches, by a quadratic algorithm.
  211. Note that we do not want to eliminate overlapping
  212. sequences in this stage, since we might be removing the
  213. wrong copy.
  214. */
  215. for (i = 0; i+MinRunSize < TextLength(); i++) {
  216. register unsigned int j = i;
  217. register int h2 = hash2(&TokenArray[i]);
  218. /* Find the first token sequence in the chain
  219. with same secondary hash code.
  220. */
  221. while ( /* there is still a forward reference */
  222. (j = forward_references[j])
  223. && /* its hash code does not match */
  224. hash2(&TokenArray[j]) != h2
  225. ) {
  226. /* continue searching */
  227. }
  228. /* short-circuit forward reference to it, or to zero */
  229. forward_references[i] = j;
  230. }
  231. #ifdef DB_FORW_REF
  232. db_forward_references("second hashing");
  233. #endif /* DB_FORW_REF */
  234. }
  235. static int
  236. hash2(const TOKEN *p) {
  237. /* a simple-minded hashing for the secondary sweep;
  238. first and last token combined in a short int
  239. */
  240. return (TOKEN2int(p[0]) << 8) + TOKEN2int(p[MinRunSize-1]);
  241. }
  242. #ifdef DB_FORW_REF
  243. static int hash3(const TOKEN *, const TOKEN *);
  244. static void
  245. make_forward_references_hash3(void) {
  246. register unsigned int i;
  247. /* do a third hash to check up on the previous two */
  248. /* this time we use a genuine compare */
  249. for (i = 0; i+MinRunSize < TextLength(); i++) {
  250. register unsigned int j = i;
  251. while ( /* there is still a forward reference */
  252. (j = forward_references[j])
  253. && /* its hash code does not match */
  254. !hash3(&TokenArray[i], &TokenArray[j])
  255. ) {
  256. /* continue searching */
  257. }
  258. /* short-circuit forward reference to it, or to zero */
  259. forward_references[i] = j;
  260. }
  261. db_forward_references("third hashing");
  262. }
  263. static int
  264. hash3(const TOKEN *p, const TOKEN *q) {
  265. /* a full comparison for the tertiary sweep */
  266. int n;
  267. for (n = 0; n < MinRunSize; n++) {
  268. if (TOKEN2int(*(p+n)) != TOKEN2int(*(q+n))) return 0;
  269. }
  270. return 1;
  271. }
  272. static int
  273. db_frw_chain(int n, char *crossed_out) {
  274. register int chain_len = -1;
  275. /* if there are two values, the chain length is still 1 */
  276. register int fw;
  277. for (fw = n; fw; fw = forward_references[fw]) {
  278. if (crossed_out[fw]) {
  279. fprintf(DebugFile,
  280. ">>>> error in forward_references[] <<<<\n"
  281. );
  282. }
  283. chain_len++;
  284. crossed_out[fw]++;
  285. }
  286. fprintf(DebugFile, "n = %d, chain_len = %d\n", n, chain_len);
  287. return chain_len;
  288. }
  289. static void
  290. db_forward_references(const char *msg) {
  291. int n;
  292. int n_frw_chains = 0; /* number of forward ref. chains */
  293. int tot_frwc_len = 0;
  294. char *crossed_out;
  295. fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
  296. fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
  297. fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
  298. crossed_out = (char *)calloc(TextLength(), sizeof (char));
  299. if (!crossed_out) {
  300. fatal(">>>> no room for db_forward_references debug table <<<<\n");
  301. }
  302. /* Each forward_references[n] starts in principle a new
  303. chain, and these chains never touch each other.
  304. We check this property by marking the positions in each
  305. chain in an array; if we meet a marked entry while
  306. following a chain, it must have been on an earlier chain
  307. and we have an error.
  308. We also determine the lengths of the chains, for statistics.
  309. */
  310. if (forward_references[0]) {
  311. fprintf(DebugFile,
  312. ">>>> forward_references[0] is not zero <<<<\n"
  313. );
  314. }
  315. for (n = 1; n < TextLength(); n++) {
  316. if (forward_references[n] && !crossed_out[n]) {
  317. /* start of a new chain */
  318. n_frw_chains++;
  319. tot_frwc_len += db_frw_chain(n, crossed_out);
  320. }
  321. }
  322. free((char *)crossed_out);
  323. fprintf(DebugFile,
  324. "text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
  325. TextLength(), n_frw_chains, tot_frwc_len
  326. );
  327. }
  328. #endif /* DB_FORW_REF */