mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-05 06:57:59 +02:00
387 lines
9.8 KiB
C
387 lines
9.8 KiB
C
/* This file is part of the software similarity tester SIM.
|
|
Written by Dick Grune, Vrije Universiteit, Amsterdam.
|
|
$Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
|
|
*/
|
|
|
|
/* Text is compared by comparing every substring to all substrings
|
|
to the right of it; this process is in essence quadratic. However,
|
|
only substrings of length at least 'MinRunSize' are of interest,
|
|
which gives us the possibility to speed up this process by using
|
|
a hash table.
|
|
|
|
For every position in the text, we construct an index which gives
|
|
the next position in the text at which a run of MinRunSize tokens
|
|
starts that has the same hash code, as calculated by hash1(). If
|
|
there is no such run, the index is 0. These forward references are
|
|
kept in the array forward_references[].
|
|
|
|
To construct this array, we use a hash table last_index[] whose size
|
|
is a prime and which is about 8 times smaller than the text array.
|
|
The hash table last_index[] is set up such that last_index[i] is the
|
|
index of the latest token with hash_code i, or 0 if there is none.
|
|
This results in hash chains of an average length of 8. See
|
|
MakeForwardReferences().
|
|
|
|
If there is not enough room for a hash table of the proper size
|
|
(which can be considerable) the hashing is not efficient any more.
|
|
In that case, the forward reference table is scanned a second time,
|
|
eliminating from any chain all references to runs that do not hash to
|
|
the same value under a second hash function, hash2(). For the UNIX
|
|
manuals this reduced the number of matches from 91.9% to 1.9% (of
|
|
which 0.06% was genuine).
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <malloc.h>
|
|
|
|
#include "system.par"
|
|
#include "debug.par"
|
|
#include "sim.h"
|
|
#include "error.h"
|
|
#include "language.h"
|
|
#include "token.h"
|
|
#include "tokenarray.h"
|
|
#include "options.h"
|
|
#include "hash.h"
|
|
|
|
/* MAIN ENTRIES */
|
|
static unsigned int *forward_references; /* to be filled by malloc() */
|
|
static int n_forward_references;
|
|
|
|
static void make_forward_references_hash1(void);
|
|
static void make_forward_references_hash2(void);
|
|
|
|
#ifdef DB_FORW_REF
|
|
static void db_forward_references(const char *);
|
|
static void make_forward_references_hash3(void);
|
|
#endif
|
|
|
|
void
|
|
MakeForwardReferences(void) {
|
|
/* Constructs the forward references table.
|
|
*/
|
|
|
|
n_forward_references = TextLength();
|
|
forward_references =
|
|
(unsigned int *)calloc(
|
|
n_forward_references, sizeof (unsigned int)
|
|
);
|
|
if (!forward_references) {
|
|
fatal("out of memory");
|
|
}
|
|
make_forward_references_hash1();
|
|
make_forward_references_hash2();
|
|
#ifdef DB_FORW_REF
|
|
make_forward_references_hash3();
|
|
#endif
|
|
}
|
|
|
|
unsigned int
|
|
ForwardReference(int i) {
|
|
if (i <= 0 || i >= n_forward_references) {
|
|
fatal("internal error, bad forward reference");
|
|
}
|
|
return forward_references[i];
|
|
}
|
|
|
|
void
|
|
FreeForwardReferences(void) {
|
|
free((char *)forward_references);
|
|
}
|
|
|
|
/* HASHING */
|
|
/*
|
|
We want a hash function whose time cost does not depend on
|
|
MinRunSize, which is a problem since the size of the value
|
|
we derive the hash function from IS equal to MinRunSize!
|
|
Therefore we base the hash function on a sample of at most 24
|
|
tokens from the input string; this works at least as well in
|
|
practice. These 24 token values will result in exactly 31
|
|
bits under the hashing algorithm used, which avoids an
|
|
overflow test. So this 24 bears no relation to the default
|
|
run size (although the fit is surprising!)
|
|
*/
|
|
|
|
#define N_SAMPLES 24
|
|
#define OPERATION ^
|
|
|
|
/* An alternative algorithm; does not seem to make any difference.
|
|
#define N_SAMPLES 23
|
|
#define OPERATION +
|
|
*/
|
|
|
|
/* Another algorithm; not yet tested
|
|
#define N_SAMPLES 24
|
|
#define OPERATION + 613 *
|
|
*/
|
|
|
|
static unsigned int *last_index;
|
|
static unsigned int hash_table_size;
|
|
static int sample_pos[N_SAMPLES];
|
|
|
|
static unsigned int
|
|
prime[] = { /* lots of hopefully suitable primes */
|
|
10639,
|
|
21283,
|
|
42571,
|
|
85147,
|
|
170227,
|
|
340451,
|
|
680959,
|
|
1361803,
|
|
2723599,
|
|
5447171,
|
|
10894379,
|
|
21788719,
|
|
43577399,
|
|
87154759,
|
|
174309383,
|
|
348618827,
|
|
697237511,
|
|
1394475011
|
|
};
|
|
|
|
static void
|
|
init_hash_table(void) {
|
|
register int n;
|
|
|
|
/* find the ideal hash table size */
|
|
n = 0;
|
|
while (prime[n] < TextLength()) {
|
|
n++;
|
|
/* this will always terminate, if prime[] is large enough */
|
|
}
|
|
|
|
/* see if we can allocate that much space, and if not, step down */
|
|
last_index = 0;
|
|
while (!last_index && n >= 0) {
|
|
hash_table_size = prime[n];
|
|
last_index = (unsigned int *)
|
|
calloc(hash_table_size, sizeof (unsigned int));
|
|
n--;
|
|
}
|
|
if (!last_index) {
|
|
fatal("out of memory");
|
|
}
|
|
|
|
/* find sample positions */
|
|
for (n = 0; n < N_SAMPLES; n++) {
|
|
/* straigh-line approximation; uninituitive as usual */
|
|
sample_pos[n] = (
|
|
(2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
|
|
/ (2 * (N_SAMPLES - 1))
|
|
);
|
|
}
|
|
}
|
|
|
|
static int hash1(const TOKEN *);
|
|
|
|
static void
|
|
make_forward_references_hash1(void) {
|
|
register int n;
|
|
|
|
init_hash_table();
|
|
|
|
/* set up the forward references using the last_index hash table */
|
|
for (n = 0; n < NumberOfTexts; n++) {
|
|
register struct text *txt = &Text[n];
|
|
register unsigned int j;
|
|
|
|
for ( /* all pos'ns in txt except the last MinRunSize-1 */
|
|
j = txt->tx_start; /* >= 1 */
|
|
j + MinRunSize - 1 < txt->tx_limit;
|
|
j++
|
|
) {
|
|
if (MayBeStartOfRun(TokenArray[j])) {
|
|
register int h = hash1(&TokenArray[j]);
|
|
|
|
if (last_index[h]) {
|
|
forward_references[last_index[h]] = j;
|
|
}
|
|
last_index[h] = j;
|
|
}
|
|
}
|
|
}
|
|
free((char *)last_index);
|
|
|
|
#ifdef DB_FORW_REF
|
|
db_forward_references("first hashing");
|
|
#endif /* DB_FORW_REF */
|
|
}
|
|
|
|
static int
|
|
hash1(const TOKEN *p) {
|
|
/* hash1(p) returns the hash code of the MinRunSize
|
|
tokens starting at p; caller guarantees that there
|
|
are at least MinRunSize tokens.
|
|
*/
|
|
register int32 h_val;
|
|
register int n;
|
|
|
|
h_val = 0;
|
|
for (n = 0; n < N_SAMPLES; n++) {
|
|
h_val = (h_val << 1) OPERATION TOKEN2int(p[sample_pos[n]]);
|
|
#if N_SAMPLES > 24
|
|
if (h_val & (1<<31)) {
|
|
h_val ^= (1<<31|1);
|
|
}
|
|
#endif
|
|
}
|
|
/* just in case somebody tries wrong N_SAMPLES and OPERATION values: */
|
|
if (h_val < 0) fatal("corrupt hash algorithm in hash1() in hash.c");
|
|
|
|
return h_val % hash_table_size;
|
|
}
|
|
|
|
static int hash2(const TOKEN *);
|
|
|
|
static void
|
|
make_forward_references_hash2(void) {
|
|
register unsigned int i;
|
|
|
|
/* do a second hash only if the original hash table was reduced */
|
|
/* Meanwhile, the quality of the primary hashing is so bad
|
|
that we are virtually forced to always do a second scan.
|
|
*/
|
|
|
|
/* Clean out spurious matches, by a quadratic algorithm.
|
|
Note that we do not want to eliminate overlapping
|
|
sequences in this stage, since we might be removing the
|
|
wrong copy.
|
|
*/
|
|
for (i = 0; i+MinRunSize < TextLength(); i++) {
|
|
register unsigned int j = i;
|
|
register int h2 = hash2(&TokenArray[i]);
|
|
|
|
/* Find the first token sequence in the chain
|
|
with same secondary hash code.
|
|
*/
|
|
while ( /* there is still a forward reference */
|
|
(j = forward_references[j])
|
|
&& /* its hash code does not match */
|
|
hash2(&TokenArray[j]) != h2
|
|
) {
|
|
/* continue searching */
|
|
}
|
|
/* short-circuit forward reference to it, or to zero */
|
|
forward_references[i] = j;
|
|
}
|
|
|
|
#ifdef DB_FORW_REF
|
|
db_forward_references("second hashing");
|
|
#endif /* DB_FORW_REF */
|
|
}
|
|
|
|
static int
|
|
hash2(const TOKEN *p) {
|
|
/* a simple-minded hashing for the secondary sweep;
|
|
first and last token combined in a short int
|
|
*/
|
|
return (TOKEN2int(p[0]) << 8) + TOKEN2int(p[MinRunSize-1]);
|
|
}
|
|
|
|
#ifdef DB_FORW_REF
|
|
|
|
static int hash3(const TOKEN *, const TOKEN *);
|
|
|
|
static void
|
|
make_forward_references_hash3(void) {
|
|
register unsigned int i;
|
|
|
|
/* do a third hash to check up on the previous two */
|
|
|
|
/* this time we use a genuine compare */
|
|
for (i = 0; i+MinRunSize < TextLength(); i++) {
|
|
register unsigned int j = i;
|
|
|
|
while ( /* there is still a forward reference */
|
|
(j = forward_references[j])
|
|
&& /* its hash code does not match */
|
|
!hash3(&TokenArray[i], &TokenArray[j])
|
|
) {
|
|
/* continue searching */
|
|
}
|
|
/* short-circuit forward reference to it, or to zero */
|
|
forward_references[i] = j;
|
|
}
|
|
|
|
db_forward_references("third hashing");
|
|
}
|
|
|
|
static int
|
|
hash3(const TOKEN *p, const TOKEN *q) {
|
|
/* a full comparison for the tertiary sweep */
|
|
int n;
|
|
|
|
for (n = 0; n < MinRunSize; n++) {
|
|
if (TOKEN2int(*(p+n)) != TOKEN2int(*(q+n))) return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int
|
|
db_frw_chain(int n, char *crossed_out) {
|
|
register int chain_len = -1;
|
|
/* if there are two values, the chain length is still 1 */
|
|
register int fw;
|
|
|
|
for (fw = n; fw; fw = forward_references[fw]) {
|
|
if (crossed_out[fw]) {
|
|
fprintf(DebugFile,
|
|
">>>> error in forward_references[] <<<<\n"
|
|
);
|
|
}
|
|
chain_len++;
|
|
crossed_out[fw]++;
|
|
}
|
|
fprintf(DebugFile, "n = %d, chain_len = %d\n", n, chain_len);
|
|
|
|
return chain_len;
|
|
}
|
|
|
|
static void
|
|
db_forward_references(const char *msg) {
|
|
int n;
|
|
int n_frw_chains = 0; /* number of forward ref. chains */
|
|
int tot_frwc_len = 0;
|
|
char *crossed_out;
|
|
|
|
fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
|
|
fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
|
|
fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
|
|
|
|
crossed_out = (char *)calloc(TextLength(), sizeof (char));
|
|
if (!crossed_out) {
|
|
fatal(">>>> no room for db_forward_references debug table <<<<\n");
|
|
}
|
|
|
|
/* Each forward_references[n] starts in principle a new
|
|
chain, and these chains never touch each other.
|
|
We check this property by marking the positions in each
|
|
chain in an array; if we meet a marked entry while
|
|
following a chain, it must have been on an earlier chain
|
|
and we have an error.
|
|
We also determine the lengths of the chains, for statistics.
|
|
*/
|
|
if (forward_references[0]) {
|
|
fprintf(DebugFile,
|
|
">>>> forward_references[0] is not zero <<<<\n"
|
|
);
|
|
}
|
|
for (n = 1; n < TextLength(); n++) {
|
|
if (forward_references[n] && !crossed_out[n]) {
|
|
/* start of a new chain */
|
|
n_frw_chains++;
|
|
tot_frwc_len += db_frw_chain(n, crossed_out);
|
|
}
|
|
}
|
|
free((char *)crossed_out);
|
|
|
|
fprintf(DebugFile,
|
|
"text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
|
|
TextLength(), n_frw_chains, tot_frwc_len
|
|
);
|
|
}
|
|
|
|
#endif /* DB_FORW_REF */
|