123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
- /* This file is part of the software similarity tester SIM.
- Written by Dick Grune, Vrije Universiteit, Amsterdam.
- $Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
- */
- /* Text is compared by comparing every substring to all substrings
- to the right of it; this process is in essence quadratic. However,
- only substrings of length at least 'MinRunSize' are of interest,
- which gives us the possibility to speed up this process by using
- a hash table.
- For every position in the text, we construct an index which gives
- the next position in the text at which a run of MinRunSize tokens
- starts that has the same hash code, as calculated by hash1(). If
- there is no such run, the index is 0. These forward references are
- kept in the array forward_references[].
- To construct this array, we use a hash table last_index[] whose size
- is a prime and which is about 8 times smaller than the text array.
- The hash table last_index[] is set up such that last_index[i] is the
- index of the latest token with hash_code i, or 0 if there is none.
- This results in hash chains of an average length of 8. See
- MakeForwardReferences().
- If there is not enough room for a hash table of the proper size
- (which can be considerable) the hashing is not efficient any more.
- In that case, the forward reference table is scanned a second time,
- eliminating from any chain all references to runs that do not hash to
- the same value under a second hash function, hash2(). For the UNIX
- manuals this reduced the number of matches from 91.9% to 1.9% (of
- which 0.06% was genuine).
- */
- #include <stdio.h>
- #include <malloc.h>
- #include "system.par"
- #include "debug.par"
- #include "sim.h"
- #include "error.h"
- #include "language.h"
- #include "token.h"
- #include "tokenarray.h"
- #include "options.h"
- #include "hash.h"
- /* MAIN ENTRIES */
- static unsigned int *forward_references; /* to be filled by malloc() */
- static int n_forward_references;
- static void make_forward_references_hash1(void);
- static void make_forward_references_hash2(void);
- #ifdef DB_FORW_REF
- static void db_forward_references(const char *);
- static void make_forward_references_hash3(void);
- #endif
- void
- MakeForwardReferences(void) {
- /* Constructs the forward references table.
- */
- n_forward_references = TextLength();
- forward_references =
- (unsigned int *)calloc(
- n_forward_references, sizeof (unsigned int)
- );
- if (!forward_references) {
- fatal("out of memory");
- }
- make_forward_references_hash1();
- make_forward_references_hash2();
- #ifdef DB_FORW_REF
- make_forward_references_hash3();
- #endif
- }
- unsigned int
- ForwardReference(int i) {
- if (i <= 0 || i >= n_forward_references) {
- fatal("internal error, bad forward reference");
- }
- return forward_references[i];
- }
- void
- FreeForwardReferences(void) {
- free((char *)forward_references);
- }
- /* HASHING */
- /*
- We want a hash function whose time cost does not depend on
- MinRunSize, which is a problem since the size of the value
- we derive the hash function from IS equal to MinRunSize!
- Therefore we base the hash function on a sample of at most 24
- tokens from the input string; this works at least as well in
- practice. These 24 token values will result in exactly 31
- bits under the hashing algorithm used, which avoids an
- overflow test. So this 24 bears no relation to the default
- run size (although the fit is surprising!)
- */
- #define N_SAMPLES 24
- #define OPERATION ^
- /* An alternative algorithm; does not seem to make any difference.
- #define N_SAMPLES 23
- #define OPERATION +
- */
- /* Another algorithm; not yet tested
- #define N_SAMPLES 24
- #define OPERATION + 613 *
- */
- static unsigned int *last_index;
- static unsigned int hash_table_size;
- static int sample_pos[N_SAMPLES];
- static unsigned int
- prime[] = { /* lots of hopefully suitable primes */
- 10639,
- 21283,
- 42571,
- 85147,
- 170227,
- 340451,
- 680959,
- 1361803,
- 2723599,
- 5447171,
- 10894379,
- 21788719,
- 43577399,
- 87154759,
- 174309383,
- 348618827,
- 697237511,
- 1394475011
- };
- static void
- init_hash_table(void) {
- register int n;
- /* find the ideal hash table size */
- n = 0;
- while (prime[n] < TextLength()) {
- n++;
- /* this will always terminate, if prime[] is large enough */
- }
- /* see if we can allocate that much space, and if not, step down */
- last_index = 0;
- while (!last_index && n >= 0) {
- hash_table_size = prime[n];
- last_index = (unsigned int *)
- calloc(hash_table_size, sizeof (unsigned int));
- n--;
- }
- if (!last_index) {
- fatal("out of memory");
- }
-
- /* find sample positions */
- for (n = 0; n < N_SAMPLES; n++) {
- /* straigh-line approximation; uninituitive as usual */
- sample_pos[n] = (
- (2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
- / (2 * (N_SAMPLES - 1))
- );
- }
- }
- static int hash1(const TOKEN *);
- static void
- make_forward_references_hash1(void) {
- register int n;
- init_hash_table();
- /* set up the forward references using the last_index hash table */
- for (n = 0; n < NumberOfTexts; n++) {
- register struct text *txt = &Text[n];
- register unsigned int j;
- for ( /* all pos'ns in txt except the last MinRunSize-1 */
- j = txt->tx_start; /* >= 1 */
- j + MinRunSize - 1 < txt->tx_limit;
- j++
- ) {
- if (MayBeStartOfRun(TokenArray[j])) {
- register int h = hash1(&TokenArray[j]);
- if (last_index[h]) {
- forward_references[last_index[h]] = j;
- }
- last_index[h] = j;
- }
- }
- }
- free((char *)last_index);
- #ifdef DB_FORW_REF
- db_forward_references("first hashing");
- #endif /* DB_FORW_REF */
- }
- static int
- hash1(const TOKEN *p) {
- /* hash1(p) returns the hash code of the MinRunSize
- tokens starting at p; caller guarantees that there
- are at least MinRunSize tokens.
- */
- register int32 h_val;
- register int n;
-
- h_val = 0;
- for (n = 0; n < N_SAMPLES; n++) {
- h_val = (h_val << 1) OPERATION TOKEN2int(p[sample_pos[n]]);
- #if N_SAMPLES > 24
- if (h_val & (1<<31)) {
- h_val ^= (1<<31|1);
- }
- #endif
- }
- /* just in case somebody tries wrong N_SAMPLES and OPERATION values: */
- if (h_val < 0) fatal("corrupt hash algorithm in hash1() in hash.c");
- return h_val % hash_table_size;
- }
- static int hash2(const TOKEN *);
- static void
- make_forward_references_hash2(void) {
- register unsigned int i;
- /* do a second hash only if the original hash table was reduced */
- /* Meanwhile, the quality of the primary hashing is so bad
- that we are virtually forced to always do a second scan.
- */
- /* Clean out spurious matches, by a quadratic algorithm.
- Note that we do not want to eliminate overlapping
- sequences in this stage, since we might be removing the
- wrong copy.
- */
- for (i = 0; i+MinRunSize < TextLength(); i++) {
- register unsigned int j = i;
- register int h2 = hash2(&TokenArray[i]);
- /* Find the first token sequence in the chain
- with same secondary hash code.
- */
- while ( /* there is still a forward reference */
- (j = forward_references[j])
- && /* its hash code does not match */
- hash2(&TokenArray[j]) != h2
- ) {
- /* continue searching */
- }
- /* short-circuit forward reference to it, or to zero */
- forward_references[i] = j;
- }
- #ifdef DB_FORW_REF
- db_forward_references("second hashing");
- #endif /* DB_FORW_REF */
- }
- static int
- hash2(const TOKEN *p) {
- /* a simple-minded hashing for the secondary sweep;
- first and last token combined in a short int
- */
- return (TOKEN2int(p[0]) << 8) + TOKEN2int(p[MinRunSize-1]);
- }
- #ifdef DB_FORW_REF
- static int hash3(const TOKEN *, const TOKEN *);
- static void
- make_forward_references_hash3(void) {
- register unsigned int i;
- /* do a third hash to check up on the previous two */
- /* this time we use a genuine compare */
- for (i = 0; i+MinRunSize < TextLength(); i++) {
- register unsigned int j = i;
- while ( /* there is still a forward reference */
- (j = forward_references[j])
- && /* its hash code does not match */
- !hash3(&TokenArray[i], &TokenArray[j])
- ) {
- /* continue searching */
- }
- /* short-circuit forward reference to it, or to zero */
- forward_references[i] = j;
- }
- db_forward_references("third hashing");
- }
- static int
- hash3(const TOKEN *p, const TOKEN *q) {
- /* a full comparison for the tertiary sweep */
- int n;
-
- for (n = 0; n < MinRunSize; n++) {
- if (TOKEN2int(*(p+n)) != TOKEN2int(*(q+n))) return 0;
- }
- return 1;
- }
- static int
- db_frw_chain(int n, char *crossed_out) {
- register int chain_len = -1;
- /* if there are two values, the chain length is still 1 */
- register int fw;
- for (fw = n; fw; fw = forward_references[fw]) {
- if (crossed_out[fw]) {
- fprintf(DebugFile,
- ">>>> error in forward_references[] <<<<\n"
- );
- }
- chain_len++;
- crossed_out[fw]++;
- }
- fprintf(DebugFile, "n = %d, chain_len = %d\n", n, chain_len);
-
- return chain_len;
- }
- static void
- db_forward_references(const char *msg) {
- int n;
- int n_frw_chains = 0; /* number of forward ref. chains */
- int tot_frwc_len = 0;
- char *crossed_out;
- fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
- fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
- fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
- crossed_out = (char *)calloc(TextLength(), sizeof (char));
- if (!crossed_out) {
- fatal(">>>> no room for db_forward_references debug table <<<<\n");
- }
- /* Each forward_references[n] starts in principle a new
- chain, and these chains never touch each other.
- We check this property by marking the positions in each
- chain in an array; if we meet a marked entry while
- following a chain, it must have been on an earlier chain
- and we have an error.
- We also determine the lengths of the chains, for statistics.
- */
- if (forward_references[0]) {
- fprintf(DebugFile,
- ">>>> forward_references[0] is not zero <<<<\n"
- );
- }
- for (n = 1; n < TextLength(); n++) {
- if (forward_references[n] && !crossed_out[n]) {
- /* start of a new chain */
- n_frw_chains++;
- tot_frwc_len += db_frw_chain(n, crossed_out);
- }
- }
- free((char *)crossed_out);
- fprintf(DebugFile,
- "text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
- TextLength(), n_frw_chains, tot_frwc_len
- );
- }
- #endif /* DB_FORW_REF */
|