18 năm trước cách đây · 30e4da99da
--- a/.gitattributes
+++ b/.gitattributes
@@ -8965,6 +8965,72 @@ utils/ptop.pp svneol=native#text/plain
 
				 utils/ptopu.pp svneol=native#text/plain
			
 
				 utils/rmcvsdir.pp svneol=native#text/plain
			
 
				 utils/rstconv.pp svneol=native#text/plain
			
 
				+utils/sim_pasc/Answers svneol=native#text/plain
			
 
				+utils/sim_pasc/ChangeLog svneol=native#text/plain
			
 
				+utils/sim_pasc/LICENSE.txt svneol=native#text/plain
			
 
				+utils/sim_pasc/Makefile svneol=native#text/plain
			
 
				+utils/sim_pasc/READ.ME svneol=native#text/plain
			
 
				+utils/sim_pasc/README.1st svneol=native#text/plain
			
 
				+utils/sim_pasc/READ_ME svneol=native#text/plain
			
 
				+utils/sim_pasc/TechnReport svneol=native#text/plain
			
 
				+utils/sim_pasc/add_run.c svneol=native#text/plain
			
 
				+utils/sim_pasc/add_run.h svneol=native#text/plain
			
 
				+utils/sim_pasc/aiso.bdy svneol=native#text/plain
			
 
				+utils/sim_pasc/aiso.spc svneol=native#text/plain
			
 
				+utils/sim_pasc/algollike.c svneol=native#text/plain
			
 
				+utils/sim_pasc/algollike.h svneol=native#text/plain
			
 
				+utils/sim_pasc/clang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/compare.c svneol=native#text/plain
			
 
				+utils/sim_pasc/compare.h svneol=native#text/plain
			
 
				+utils/sim_pasc/debug.par svneol=native#text/plain
			
 
				+utils/sim_pasc/error.c svneol=native#text/plain
			
 
				+utils/sim_pasc/error.h svneol=native#text/plain
			
 
				+utils/sim_pasc/hash.c svneol=native#text/plain
			
 
				+utils/sim_pasc/hash.h svneol=native#text/plain
			
 
				+utils/sim_pasc/idf.c svneol=native#text/plain
			
 
				+utils/sim_pasc/idf.h svneol=native#text/plain
			
 
				+utils/sim_pasc/javalang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/lang.h svneol=native#text/plain
			
 
				+utils/sim_pasc/language.h svneol=native#text/plain
			
 
				+utils/sim_pasc/lex.c svneol=native#text/plain
			
 
				+utils/sim_pasc/lex.h svneol=native#text/plain
			
 
				+utils/sim_pasc/lisplang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/m2lang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/miralang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/options.c svneol=native#text/plain
			
 
				+utils/sim_pasc/options.h svneol=native#text/plain
			
 
				+utils/sim_pasc/pascallang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/pass1.c svneol=native#text/plain
			
 
				+utils/sim_pasc/pass1.h svneol=native#text/plain
			
 
				+utils/sim_pasc/pass2.c svneol=native#text/plain
			
 
				+utils/sim_pasc/pass2.h svneol=native#text/plain
			
 
				+utils/sim_pasc/pass3.c svneol=native#text/plain
			
 
				+utils/sim_pasc/pass3.h svneol=native#text/plain
			
 
				+utils/sim_pasc/percentages.c svneol=native#text/plain
			
 
				+utils/sim_pasc/percentages.h svneol=native#text/plain
			
 
				+utils/sim_pasc/runs.c svneol=native#text/plain
			
 
				+utils/sim_pasc/runs.h svneol=native#text/plain
			
 
				+utils/sim_pasc/settings.par svneol=native#text/plain
			
 
				+utils/sim_pasc/sim.1 svneol=native#text/plain
			
 
				+utils/sim_pasc/sim.c svneol=native#text/plain
			
 
				+utils/sim_pasc/sim.h svneol=native#text/plain
			
 
				+utils/sim_pasc/sim.html svneol=native#text/plain
			
 
				+utils/sim_pasc/sim.txt svneol=native#text/plain
			
 
				+utils/sim_pasc/sortlist.bdy svneol=native#text/plain
			
 
				+utils/sim_pasc/sortlist.spc svneol=native#text/plain
			
 
				+utils/sim_pasc/stream.c svneol=native#text/plain
			
 
				+utils/sim_pasc/stream.h svneol=native#text/plain
			
 
				+utils/sim_pasc/sysidf.mk svneol=native#text/plain
			
 
				+utils/sim_pasc/sysidf.msdos svneol=native#text/plain
			
 
				+utils/sim_pasc/sysidf.unix svneol=native#text/plain
			
 
				+utils/sim_pasc/system.par svneol=native#text/plain
			
 
				+utils/sim_pasc/text.c svneol=native#text/plain
			
 
				+utils/sim_pasc/text.h svneol=native#text/plain
			
 
				+utils/sim_pasc/textlang.l svneol=native#text/plain
			
 
				+utils/sim_pasc/token.c svneol=native#text/plain
			
 
				+utils/sim_pasc/token.h svneol=native#text/plain
			
 
				+utils/sim_pasc/tokenarray.c svneol=native#text/plain
			
 
				+utils/sim_pasc/tokenarray.h svneol=native#text/plain
			
 
				 utils/simulator/Makefile svneol=native#text/plain
			
 
				 utils/simulator/Makefile.fpc svneol=native#text/plain
			
 
				 utils/simulator/alphasim.pas svneol=native#text/plain
			
--- a/utils/sim_pasc/Answers
+++ b/utils/sim_pasc/Answers
@@ -0,0 +1,57 @@
 
				+		The software and text similarity tester SIM
			
 
				+
			
 
				+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
			
 
				+Miranda, and natural language.  It is used
			
 
				+
			
 
				+- to detect potentially duplicated code fragments in large software projects,
			
 
				+	in program text but also in shell scripts and documentation;
			
 
				+- to detect plagiarism in software projects, educational and otherwise.
			
 
				+
			
 
				+SIM is available through ftp.  The directory
			
 
				+
			
 
				+	ftp.cs.vu.nl:pub/dick/similarity_tester
			
 
				+
			
 
				+contains the sources (in C) and the MSDOS .EXEs.
			
 
				+
			
 
				+The software similarity tester is very efficient and allows us to compare
			
 
				+this year's students' work with that collected from many past years (much to
			
 
				+the dismay of some, mostly non-CS, students).  Students are told in advance
			
 
				+that their work is going to be compared, but some are non-believers ...
			
 
				+
			
 
				+The output of the similarity tester can be processed by a number of shell
			
 
				+scripts by Matty Huntjens.  These shell scripts take sim output and produce
			
 
				+lists of suspect submissions, histograms and the like.
			
 
				+The present version of these scripts is very much geared to the local situation
			
 
				+at the Vrije Universiteit, though; they are low on portability.
			
 
				+Matty Huntjens' email address is [email protected].
			
 
				+
			
 
				+We are not afraid that students would try to tune their work to the
			
 
				+similarity tester.  We reckon if they can do that they can also do the
			
 
				+exercise.
			
 
				+
			
 
				+Since this piece of handicraft does not qualify as research, there are no
			
 
				+international papers on it.  A paper, titled `Detecting copied submissions in
			
 
				+computer science lab work', was published in a local (i.e. Dutch) computer
			
 
				+science journal:
			
 
				+
			
 
				+%A Dick Grune
			
 
				+%A Matty Huntjens
			
 
				+%T Het detecteren van kopie\(:en bij informatica-practica
			
 
				+%J Informatie (in Dutch)
			
 
				+%V 31
			
 
				+%N 11
			
 
				+%D Nov 1989
			
 
				+%P 864-867
			
 
				+
			
 
				+The ftp directory contains a terse technical report about the internal
			
 
				+working of the program.
			
 
				+
			
 
				+					Dick Grune
			
 
				+					Vrije Universiteit
			
 
				+					de Boelelaan 1081
			
 
				+					1081 HV  Amsterdam
			
 
				+					the Netherlands
			
 
				+					[email protected]
			
 
				+					+31 20 444 7744
			
 
				+----------------------------------------------------------------
			
 
				+With infinitely many exceptions, what you do makes no difference.
			
--- a/utils/sim_pasc/ChangeLog
+++ b/utils/sim_pasc/ChangeLog
@@ -0,0 +1,580 @@
 
				+2007-08-23  Dick Grune  <[email protected]>
			
 
				+	LICENSE.txt added.
			
 
				+
			
 
				+2006-11-27  Dick Grune  <[email protected]>
			
 
				+	Removal of setbuff() for compatibility.
			
 
				+
			
 
				+2005-01-17  Dick Grune  <[email protected]>
			
 
				+	Corrections by Jerry James <[email protected]>; ANSIizing, etc.
			
 
				+
			
 
				+2004-08-05  Dick Grune  <[email protected]>
			
 
				+	Finished the 'percentage' option.
			
 
				+
			
 
				+08-Nov-2001	Dick Grune
			
 
				+	Begun to add a 'percentage' option, which will express the
			
 
				+	similarity between two files in percents.
			
 
				+
			
 
				+27-Sep-2001	Dick Grune
			
 
				+	Split add_run() off from compare.c into add_run.c, to accommodate
			
 
				+	different add_run()s, for different types of processing.
			
 
				+
			
 
				+27-Nov-1998	Dick Grune
			
 
				+	Installed a Miranda version supplied by Emma Norling ([email protected])
			
 
				+
			
 
				+23-Feb-1998	Dick Grune
			
 
				+	Renamed text.l to textlang.l for uniformity and to make room for
			
 
				+	a possible module text.[ch].
			
 
				+
			
 
				+	Isolated a module for handling the token array from buff.[ch] to
			
 
				+	tokenarray.[ch], and renamed buff.[ch] to text.[ch].
			
 
				+
			
 
				+23-Feb-1998	Dick Grune
			
 
				+	There is probably not much point in abandoning the nl_buff list
			
 
				+	when running out of memory for TokenArray[]: each token costs 1
			
 
				+	byte for the token and 4 bytes for the entry in
			
 
				+	forward_references[], a total of 5 bytes.  There are about 3
			
 
				+	tokens to a line, together requiring 15 bytes, plus 1 byte in
			
 
				+	nl_buff yields 16 bytes.  So releasing nl_buff frees only 1/16 =
			
 
				+	6.7 % of memory.
			
 
				+
			
 
				+	Since the code is a bother, I removed it.  Note that nl_buff is
			
 
				+	still abandoned when the number of tokens in a line does not fit
			
 
				+	in one unsigned char (but that is not very likely to happen).
			
 
				+
			
 
				+	
			
 
				+21-Feb-1998	Dick Grune
			
 
				+	Printing got into an infinite loop when the last line of the
			
 
				+	input was not terminated by a newline AND contained tokens that
			
 
				+	were included in a matching run.
			
 
				+	This was due to a double bug: 1. the non-terminated line was not
			
 
				+	registered properly in NextTextTokenObtained() / CloseText(),
			
 
				+	and 2. the loop in pass 2 which sets the values of
			
 
				+	pos->ps_nl_cnt was terminated prematurely when the file turned
			
 
				+	out to be shorter than the list of pos-es indicated.
			
 
				+	Both bugs were corrected, the first by supplying an extra
			
 
				+	newline in CloseText() when one is found missing, and the second
			
 
				+	by rewriting the list-parallel loop in pass 2.
			
 
				+
			
 
				+02-Feb-1998	Dick Grune
			
 
				+	Pascal does not differentiate between strings and characters
			
 
				+	(strings of one character); this difference has been removed
			
 
				+	from pascallang.l.
			
 
				+
			
 
				+22-Jan-1998	Dick Grune
			
 
				+	Detection of non-ASCII characters added.  Since the lexical
			
 
				+	analyser itself generates non-ASCII characters, the test must occur
			
 
				+	earlier.  We could replace the input routine of lex by a
			
 
				+	checking routine, but with several lex-es going around, we want
			
 
				+	a more lex-independent solution.  To allow each language its own
			
 
				+	restrictions about non-ASCII characters, the check is
			
 
				+	implemented in the *lang.l files.
			
 
				+
			
 
				+28-Nov-1997	Dick Grune
			
 
				+	Changed the name of the C similarity tester 'sim' to 'sim_c', for
			
 
				+	uniformity with sim_java, etc.
			
 
				+
			
 
				+23-Nov-1997	Dick Grune
			
 
				+	Java version finished; checked by Matty Huntjens and crew.
			
 
				+
			
 
				+24-Jun-1997	Dick Grune
			
 
				+	Started on a Java version, by copying the C version.
			
 
				+
			
 
				+22-Jun-1997	Dick Grune
			
 
				+	Modern lexical analysers, among which flex, read the entire input into
			
 
				+	a buffer before they issue the first token.  As a result, ftell() no
			
 
				+	longer gives a usable indication of the position of a token in a file.
			
 
				+	This pulls the rug from under the nl_buff mechanism in buff.c, which
			
 
				+	is removed.  We lose a valuable optimization this way, but there just
			
 
				+	seems to be no way to keep it.
			
 
				+
			
 
				+	Note that this has nothing to do with the problem in MS-DOS of
			
 
				+	character count and fseek position not being synchronized.  That
			
 
				+	problem has been solved on June 14, 1991 (which see) and the code has
			
 
				+	been running OK since.
			
 
				+
			
 
				+18-Jun-1997	Dick Grune
			
 
				+	The thought has occurred to use McCreight's linear longest common
			
 
				+	substring algorithm rather than the existing algorithm, which has a
			
 
				+	small quadratic component.  There are a couple of problems with this:
			
 
				+	1.	We need the longest >non-overlapping< common substring;
			
 
				+		McCreight provides just the longest.  It is not at all clear
			
 
				+		how to modify the algorithm.
			
 
				+	2.	Once we have found our LCS, we want to find the
			
 
				+		one-but-longest; it is far from obvious how to do that in
			
 
				+		McCreight's algorithm.
			
 
				+	3.	Once we have found our LCS, we want to take one of its
			
 
				+		copies out of the game, to suppress duplicate messages.
			
 
				+		Again, it is difficult to see how to do that, without
			
 
				+		redoing all the calculations.
			
 
				+	4.	McCreight's algorithm seems to require about two binary
			
 
				+		tree nodes per token, say 8 bytes, which is double what we
			
 
				+		use now.
			
 
				+
			
 
				+17-Jun-1997	Dick Grune
			
 
				+	Did some experimenting with the hash function; it is still
			
 
				+	pretty bad: the simple-minded second sweep through
			
 
				+	forward_references easily removes another 80-99% of false hits.
			
 
				+	Next, a third sweep that does a full comparison will remove another
			
 
				+	large percentage.
			
 
				+	
			
 
				+	So I have left in the second sweep in all cases.
			
 
				+	
			
 
				+	There are a couple of questions here:
			
 
				+	1. Can we find a better hash function, or will we forever need a
			
 
				+		second sweep?
			
 
				+	2. Does it actually matter, or will we lose on more expensive
			
 
				+		hashing what we gain by having a better set of forward
			
 
				+		references in compare.c?
			
 
				+
			
 
				+
			
 
				+16-Jun-1997	Dick Grune
			
 
				+	Cleaned up sim.h and renamed aiso.[ch] to runs.[ch] since they
			
 
				+	are instantiations of the aiso module concerned with runs.
			
 
				+	Aiso.[spc|bdy] stays aiso.[spc|bdy], of course.
			
 
				+
			
 
				+16-Jun-1997	Dick Grune
			
 
				+	Redid largest_function() in algollike.c.
			
 
				+	Corrected bug in CheckRun; it now always removes NonFinals from
			
 
				+	the end, even when it has first applied largest_function().
			
 
				+
			
 
				+15-Jun-1997	Dick Grune
			
 
				+	Reorganized the layers around the input file.  There were and
			
 
				+	still are three layers: lang, stream and buff.
			
 
				+
			
 
				+	Since the lex_X variables are hoisted unchanged through the levels
			
 
				+	lang, stream, and buff, to be used by pass1, pass2, etc., they
			
 
				+	have to be placed in a module of their own.
			
 
				+
			
 
				+	The token-providing module 'lang' has three interfaces:
			
 
				+	-	lang.h, which provides access to the lowest-level token
			
 
				+			routines, to be used by the next level.
			
 
				+	-	lex.h, which provides the lex variables, to be used by
			
 
				+			all and sundry.
			
 
				+	-	language.h, which provides language-specific info about
			
 
				+			tokens, concerning their suitability as initial
			
 
				+			and final tokens, to be used by higher levels.
			
 
				+			
			
 
				+	This structure is not satisfactory, but it is also unreasonable
			
 
				+	to combine them in one interface.
			
 
				+
			
 
				+	There is no single lang.c; rather it is represented by the
			
 
				+	various Xlang.c files generated from the Xlang.l files.
			
 
				+
			
 
				+14-Jun-1997	Dick Grune
			
 
				+	Added a Makefile zip entry to parallel the shar entry.
			
 
				+
			
 
				+13-Jun-1997	Dick Grune
			
 
				+	A number of simplifications, in view of better software and bigger
			
 
				+	machines:
			
 
				+	-	Removed good_realloc from hash.c; I don't think there are
			
 
				+		any bad reallocs left.
			
 
				+	-	Removed the option to run without forward_references.
			
 
				+		On a 16Mb machine this means you have at least 2M tokens;
			
 
				+		using a quadratic algorithm will take 4*10^6 sec. at an
			
 
				+		impossible rate of 1M actions/sec., which is some 50 days.
			
 
				+		Forget it.
			
 
				+	-	Renamed lang() to print_stream(), and incorporated it in sim.c
			
 
				+	-	Removed the MSDOS subdirectory mechanism in the Makefile.
			
 
				+	-	Removed the funny and sneaky double parameter expansion in
			
 
				+		the call of idf_in_list().
			
 
				+
			
 
				+12-Jun-1997	Dick Grune
			
 
				+	Converted to ANSI C.  Removed cport.h.
			
 
				+
			
 
				+09-Jan-1995	Dick Grune
			
 
				+	Decided not to do directories: they usually contain extraneous
			
 
				+	files and doing sim * is simple enough anyway.
			
 
				+
			
 
				+09-Sep-1994	Dick Grune
			
 
				+	Added system.h to cater for the (few) differences between Unix and DOS.
			
 
				+	The #define int32 is also supplied there.
			
 
				+
			
 
				+05-Sep-1994	Dick Grune
			
 
				+	Added many prototype declarations using cport.h.
			
 
				+	Added a depend entry to the Makefile.
			
 
				+
			
 
				+31-Aug-1994	Dick Grune
			
 
				+	All these changes require a 32 bit integer; introduced a #define
			
 
				+	int32, set from the command line in the Makefile.
			
 
				+
			
 
				+25-Aug-1994	Dick Grune
			
 
				+	It turned out that one of the most often called routines was .rem,
			
 
				+	from idf_hashed() in idf.c.  Moving the % out of the loop chafed off
			
 
				+	another 6% and reduced the time to 18.4 sec.
			
 
				+
			
 
				+19-Aug-1994	Dick Grune
			
 
				+	With very large files (e.g., concatenated /usr/man/man1/*) the fixed
			
 
				+	built-in hash table size of 10639 is no longer satisfactory.  Hash.c
			
 
				+	now finds a prime about 8 times smaller than the text_size to use
			
 
				+	for hash table size; this achieves optimal speed-up without gobbling
			
 
				+	up too much memory.  Reduced the time for the above file from 30.2
			
 
				+	sec. to 19.6 sec.
			
 
				+	For checking, the same test was run with all hashing off; it took
			
 
				+	20h 27m 19s = 73639 sec.  But it worked.
			
 
				+
			
 
				+11-Aug-1994	Dick Grune
			
 
				+	For large values of MinRunSize (>1000) a large part of the time
			
 
				+	(>two-thirds) was spent in calculating the hash values for each
			
 
				+	position in the input, since the cost of this calculation was
			
 
				+	proportional to MinRunSize.  We now sample a maximum of 24 tokens
			
 
				+	from the input string to calculate the hash value, and avoid
			
 
				+	overflow.  On my workstation, this reduces the time for
			
 
				+		sim_text -r 1000 -n /usr/man/man1/*
			
 
				+	from 60 sec to 21 sec.
			
 
				+
			
 
				+30-Jun-1992	Dick Grune,kamer R4.40,telef. 5778
			
 
				+	There was an amazing bug in buff.c where NextTextToken() for pass 2
			
 
				+	omitted to set lex_token to EOL when retrieving newline info from
			
 
				+	nl_buff. Worked until now!?!
			
 
				+
			
 
				+23-Sep-1991	Dick Grune
			
 
				+	Cport.h introduced, CONST and *.spc only.
			
 
				+
			
 
				+17-Sep-1991	Dick Grune
			
 
				+	The position-sorting routine in pass2.c has been made into a
			
 
				+	separate generic module.
			
 
				+
			
 
				+14-Jun-1991	Dick Grune ([email protected]) at dick.cs.vu.nl
			
 
				+	Replaced the determination of the input position through counting
			
 
				+	input characters by calls of ftell(); this is cleaner and the other
			
 
				+	method will never work on MSDOS.
			
 
				+
			
 
				+30-May-1989	Dick Grune (dick) at dick
			
 
				+	Replaced the old top-100 module (which had been extended to top-10000
			
 
				+	already anyway) by the new aiso (arbitrary-in sorted-out) module.
			
 
				+	This caused a considerable speed-up on the Mod2 test bed:
			
 
				+		 %time  cumsecs  #call  ms/call  name
			
 
				+		  17.9    99.20   7209    13.76  _InsertTop
			
 
				+		   0.3     1.37   7209     0.19  _InsertAiso
			
 
				+	It turns out that malloc() is not a serious problem, so no special
			
 
				+	version for the aiso module is required.
			
 
				+
			
 
				+23-May-1989	Dick Grune (dick) at dick
			
 
				+	No more uncommented comment at the end of preprocessor lines, to
			
 
				+	conform to ANSI C.
			
 
				+
			
 
				+23-May-1989	Dick Grune (dick) at dick
			
 
				+	Added code in the X.l files to (silently) reject characters over 0200.
			
 
				+	This does not really help, since lex stops on null chars. Ah, well.
			
 
				+
			
 
				+19-May-1989	Dick Grune (dick) at dick
			
 
				+	Made the token as handled by sim into an abstract data type, for
			
 
				+	aesthetic reasons. Sign extension is still a problem.
			
 
				+
			
 
				+03-May-1989	Dick Grune (dick) at dick
			
 
				+	Optimized lcs() by first checking from the end if a sufficiently long
			
 
				+	run is present; if in fact only the first 12 tokens match, chances
			
 
				+	are good that you can reject the run right away by first testing
			
 
				+	the 20th token, then the 19th, and so on.
			
 
				+
			
 
				+21-Apr-1989	Dick Grune (dick) at dick
			
 
				+	A run of sim_m2 finding 7209 similarities raised the question of
			
 
				+	the appropriateness of the linear sort in sort_pos(). Profiling
			
 
				+	showed that in this case sorting takes all of 7.5 % of the total
			
 
				+	time. Putting the word 'register' in the right places in
			
 
				+	sort_pos() lowered this number to 4.6%.
			
 
				+
			
 
				+20-Apr-1989	Dick Grune (dick) at dick
			
 
				+	Moved the test for MayBeStartOfRun() from compare.c (where it is
			
 
				+	done again and again) to hash.c, where its effect is incorporated in
			
 
				+	the forward reference chain.
			
 
				+
			
 
				+14-Apr-1989	Dick Grune (dick) at dick
			
 
				+	Replaced elem_of() by bit tables, headers[] and trailers[], to be
			
 
				+	prefilled from Headers[] and Trailers[] by a call of
			
 
				+	InitLanguage(). This saves a few percents.
			
 
				+
			
 
				+13-Apr-1989	Dick Grune (dick) at dick
			
 
				+	Implemented the -e and the -S option, by putting yet another loop
			
 
				+	in compare.c
			
 
				+
			
 
				+13-Apr-1989	Dick Grune (dick) at dick
			
 
				+	The -- option (displaying the tokens) will now handle more than one
			
 
				+	file.
			
 
				+
			
 
				+20-Jan-1989	Dick Grune (dick) at dick
			
 
				+	After the modification of 19-Dec-88, 12% of the time went into
			
 
				+	updating the positions in the chunks, as they were produced by the
			
 
				+	matching process. This matching process identifies runs (matches)
			
 
				+	by token position, which has to be recalculated to lseek positions
			
 
				+	and line numbers. To this end the files are read again, and for
			
 
				+	each line all positions found were checked to see if they applied
			
 
				+	to this line; this was an awfully stupid algorithm, but since much
			
 
				+	more time was spent elsewhere, it did not really matter. With all
			
 
				+	the saving below, however, it had risen to second position, after
			
 
				+	yylook() with 35%.
			
 
				+
			
 
				+	The solution was, to sort the positions in the same order in which
			
 
				+	they would be met by the reading of the files. The process is then
			
 
				+	linear. This required some extensive hacking in pass2.c
			
 
				+
			
 
				+06-Jan-1989	Dick Grune (dick) at dick
			
 
				+	The modification below did indeed save 25%. The newline information
			
 
				+	is now reduced to 2 shorts; 2 chars were not enough, since some
			
 
				+	lines are longer than 127 bytes, and a char and a short together
			
 
				+	take as much room as two shorts.
			
 
				+
			
 
				+19-Dec-1988	Dick Grune (dick) at dick
			
 
				+	To avoid reading the files twice (which is still taking 25% of the
			
 
				+	time), the first pass will now collect newline information for the
			
 
				+	second pass in a buffer called nl_buff[].  This buffer, and the
			
 
				+	original token buffer now named TokenArray[], are managed by the file
			
 
				+	buff.c, which implements a layer between stream.h and pass?.c. This
			
 
				+	layer provides OpenText(), NextTextToken() and CloseText(), each
			
 
				+	with a parameter telling which pass it is.
			
 
				+
			
 
				+06-Dec-1988	Dick Grune (dick) at dick
			
 
				+	As an introduction to removing the second pass altogether, the
			
 
				+	first and second scan were unified, i.e., their input is identical.
			
 
				+	This also means that the call sim -[12] has now been replaced by
			
 
				+	one call:  sim --.
			
 
				+
			
 
				+23-Sep-1988	Dick Grune (dick) at dick
			
 
				+	Dynamic allocation of line buffers in pass 3.  This removes the
			
 
				+	restriction on the page width.
			
 
				+
			
 
				+22-Sep-1988	Dick Grune (dick) at dick
			
 
				+	In order to give better messages on incorrect calls to sim, the
			
 
				+	whole option handling has been concentrated in a file option.c and
			
 
				+	separated from the options and their messages themselves. See sim.c
			
 
				+
			
 
				+07-Sep-1988	Dick Grune (dick) at dick
			
 
				+	For long text sequences (say hundreds of thousands of tokens),
			
 
				+	the hashing is not really efficient any more since too many
			
 
				+	spurious matches occur.  Therefore, the forward reference table is
			
 
				+	scanned a second time, eliminating from any chain all references to
			
 
				+	runs that do not end in the same token.  For the UNIX manuals this
			
 
				+	reduced the number of matches from 91.9% to 1.9% (of which 0.06%
			
 
				+	were genuine).
			
 
				+
			
 
				+30-Aug-1988	Dick Grune (dick) at dick
			
 
				+	For compatibility, NextTop has been rewritten to yield true or
			
 
				+	false and to accept a pointer to a run as a parameter.
			
 
				+
			
 
				+30-Aug-1988	Dick Grune (dick) at dick
			
 
				+	When trying to find line-number and lseek position to beginnings
			
 
				+	and ends of runs found, the whole set of runs was scanned for each
			
 
				+	line in each file.  Now only the runs belonging to that file are
			
 
				+	scanned; to this end another linked list has been braided through
			
 
				+	the data structures (tx_chunk).
			
 
				+
			
 
				+30-Aug-1988	Dick Grune (dick) at dick
			
 
				+	The longest-common-substring algorithm was called much too often,
			
 
				+	mainly because the forward references made by hashing suffered from
			
 
				+	pollution.  If you have say 1000 tokens and a hash range of say
			
 
				+	10000, about 5 % of the hashings will be false matches, i.e. 50
			
 
				+	matches, which is quite a lot on a natural number of 2 to 3 matches.
			
 
				+	Improved by doing a second check in make_forw_ref().
			
 
				+
			
 
				+12-Jun-1988	Dick Grune (dick) at dick
			
 
				+	Installed a Lisp version supplied by Gertjan Akkerman.
			
 
				+
			
 
				+15-Jan-1988	Dick Grune (dick) at dick
			
 
				+	Added register declarations all over the place.
			
 
				+
			
 
				+14-Jan-1988	Dick Grune (dick) at dick
			
 
				+	It is often useful to match a piece of code exactly, especially
			
 
				+	when function names (or, even more so, macro names) are involved.
			
 
				+	What one would want is having all the letters in the text array,
			
 
				+	but this is kind of hard, since each entry is one lexical item.
			
 
				+	This means that under the -F option each letter is a lex item, and
			
 
				+	normally each tag is a lex item; this requires two lex grammars in
			
 
				+	one program; no good.  So, on the -F flag we hash the identifier
			
 
				+	into one lex item, which is hopefully characteristic enough.  It
			
 
				+	works.
			
 
				+
			
 
				+30-Sep-1987	Dick Grune (dick) at dick
			
 
				+	Some cosmetics.
			
 
				+
			
 
				+31-Aug-1987	Dick Grune (dick) at dick
			
 
				+	Moved the whole thing to the SUN (while testing on a VAX and a
			
 
				+	MC68000)
			
 
				+
			
 
				+16-Aug-1987	Dick Grune (dick) at dick
			
 
				+	The test program lang.c is no longer a main program, but rather a
			
 
				+	subroutine called in main() in sim.c, through the command line
			
 
				+	option -1 or -2.
			
 
				+
			
 
				+23-Apr-1987	Dick Grune (dick) at tjalk
			
 
				+	Changed the name 'index' into 'elem_of', because of compatibility
			
 
				+	problems on different Unices. Added a declaration for it in
			
 
				+	the file algollike.c
			
 
				+
			
 
				+10-Mar-1987	Dick Grune (dick) at tjalk
			
 
				+	Changed the printing of the header of a run so that:
			
 
				+	-	long file names will no longer be truncated
			
 
				+	-	the run length is displayed
			
 
				+
			
 
				+27-Jan-1987	Dick Grune (dick) at tjalk
			
 
				+	Switched it right off again!  Getting them in textual order is
			
 
				+	still more unpleasant, since now you cannot find the important
			
 
				+	ones if there are more than a few runs.
			
 
				+
			
 
				+27-Jan-1987	Dick Grune (dick) at tjalk
			
 
				+	Going to experiment with leaving out the sorting; just all the
			
 
				+	runs, in the order we meet them.  Should be as good or better.
			
 
				+	Comparisons of more than 100 runs are very rare anyway, so the
			
 
				+	fact that those over a 100 are rejected is probably no great
			
 
				+	help.  Getting them in a funny order is a nuisance, however.  Down
			
 
				+	with featurism.  Just to be safe, present version saved as
			
 
				+	870127.SV
			
 
				+
			
 
				+26-Dec-1986	Dick Grune (dick) at tjalk
			
 
				+	Names of overall parameters in params.h changed to more uniformity.
			
 
				+
			
 
				+26-Dec-1986	Dick Grune (dick) at tjalk
			
 
				+	Since the top package and the instantiation system have grown
			
 
				+	apart so much, I have integrated the old top package into sim,
			
 
				+	i.e., done the instantiation by hand.  This removes top.g and
			
 
				+	top.p, and will save outsiders from wondering what is going on
			
 
				+	here.
			
 
				+
			
 
				+23-Dec-1986	Dick Grune (dick) at tjalk
			
 
				+	Use setbuf to print unbuffered while reading the files (lex core
			
 
				+	dumps, other mishaps) and print buffered while printing the real
			
 
				+	output (for speed).
			
 
				+
			
 
				+30-Nov-1986	Dick Grune (dick) at tjalk
			
 
				+	Various small changes in *lang.l:
			
 
				+		; ignored conditionally (!options['f'])
			
 
				+		new format for tokens in struct idf
			
 
				+		cosmetics: macro Layout, macro UnsafeComChar, no \n
			
 
				+			in character denotations, more than one char
			
 
				+			in a char denotation in Pascal, etc.
			
 
				+
			
 
				+30-Nov-1986	Dick Grune (dick) at tjalk
			
 
				+	Added a Modula-2 version.
			
 
				+
			
 
				+29-Nov-1986	Dick Grune (dick) at tjalk
			
 
				+	Restricting tokens to the ASCII95 character set is really too
			
 
				+	severe: some languages have many more reserved words (COBOL!).
			
 
				+	Corrected this by adding a couple of '&0377' in strategic places.
			
 
				+	Added a routine for printing the 8-bit beasties: show_token().
			
 
				+
			
 
				+15-Aug-1986	Dick Grune (dick) at tjalk
			
 
				+	Since the ; is superfluous in both C and Pascal, it is now ignored
			
 
				+	by clang.l and pascallang.l
			
 
				+
			
 
				+15-Aug-1986	Dick Grune (dick) at tjalk
			
 
				+	The code in CheckRun in Xlang.l was incorrect in that it used the
			
 
				+	wrong criterion for throwing away trailing garbage. I've taken
			
 
				+	CheckRun etc. out of the Xlang.l-s and turned them into a module
			
 
				+	"algollike.c".  Made a cleaner interface and avoided duplication of
			
 
				+	code.
			
 
				+
			
 
				+02-Jul-1986	Dick Grune (dick) at tjalk
			
 
				+	Looking backwards in compare.c to see if we are in the middle of a
			
 
				+	run is an atavism. You can be and still be all right, e.g., if
			
 
				+	part of the run was rejected as not fitting for a function.
			
 
				+	Removed from compare.c.
			
 
				+
			
 
				+10-Jun-1986	Dick Grune (dick) at tjalk
			
 
				+	The function hash_code() in hash.c could yield a negative value;
			
 
				+	corrected.
			
 
				+
			
 
				+09-Jun-1986	Dick Grune (dick) at tjalk
			
 
				+	Changed the name of the file text.h to sim.h.  Sim.h is more
			
 
				+	appropriate and text.h sounds as if it belongs to text.l, with
			
 
				+	which it has no connection.
			
 
				+
			
 
				+04-Jun-1986	Dick Grune (dick) at tjalk
			
 
				+	After having looked at a couple of hash functions and having done
			
 
				+	some calculations on the number of duplicates normally encountered
			
 
				+	in hash functions, I conclude that our function in hash.c is quite
			
 
				+	good.  Removed all the statistics-gathering stuff.
			
 
				+	
			
 
				+	Actually, hash_table[] is not the hash table at all; it is a
			
 
				+	forward reference table; likewise, the real hash table was called
			
 
				+	last[].  Renamed both.
			
 
				+	
			
 
				+	There is a way to keep the hash table local without putting it on
			
 
				+	the stack: use malloc().
			
 
				+
			
 
				+02-Jun-1986	Dick Grune (dick) at tjalk
			
 
				+	Added a simple lex file for text: each word is condensed into a
			
 
				+	hash code which is mapped on the ASCII95 character set.  This
			
 
				+	turns out to be quite effective.
			
 
				+
			
 
				+01-Jun-1986	Dick Grune (dick) at tjalk
			
 
				+	The macros cput(tk) and c_eol() both have a return in them, so any
			
 
				+	code after them may not be executed -> they have to be last in an
			
 
				+	entry.  But they weren't, in many places; I can't imagine why it
			
 
				+	all worked nevertheless.  They have been renamed return_tk(tk) and
			
 
				+	return_eol() and the entries have been restructured.
			
 
				+
			
 
				+30-May-1986	Dick Grune (dick) at tjalk
			
 
				+	Moved the string and character entries in clang.l and pascallang.l
			
 
				+	to a place behind the comment entries, to avoid strings (and
			
 
				+	characters) being recognized inside comments.  I first thought
			
 
				+	this would not happen, but as Maarten pointed out, if both
			
 
				+	interpretations have the same length, lex will take the first
			
 
				+	entry. Now this will happen if the string occupies the whole line
			
 
				+	that would otherwise be taken as a comment.  In short,
			
 
				+	/*
			
 
				+	"hallo"
			
 
				+	*/
			
 
				+	would return ".
			
 
				+
			
 
				+28-May-1986	Dick Grune (dick) at tjalk
			
 
				+	Added -d option, to display the output in diff(1) format (courtesy
			
 
				+	of Maarten van der Meulen).
			
 
				+	Rewrote the lexical parsing of comments (likewise courtesy Maarten
			
 
				+	van der Meulen).
			
 
				+
			
 
				+20-May-1986	Dick Grune (dick) at tjalk
			
 
				+	Added a routine to convert identifiers to lower case in
			
 
				+	pascallang.l .
			
 
				+
			
 
				+19-May-1986	Dick Grune (dick) at tjalk
			
 
				+	Added -a option, to quickly check antecedent of a file (courtesy
			
 
				+	of Maarten van der Meulen).
			
 
				+
			
 
				+18-May-1986	Dick Grune (dick) at tjalk
			
 
				+	Brought everything under RCS/CVS.
			
 
				+
			
 
				+18-Mar-1986	Dick Grune (dick) at tjalk
			
 
				+	Added modifications by Paul Bame (hp-lsd!paul@hp-labs) to have an
			
 
				+	option -w to set the page width.
			
 
				+
			
 
				+21-Feb-1986	Dick Grune (dick) at tjalk
			
 
				+	Took array last[N_HASH] out of make_hash() in hash.c, due to stack
			
 
				+	overflow on the Gould (reported by George Walker
			
 
				+	[email protected])
			
 
				+
			
 
				+16-Feb-1986	Dick Grune (dick) at tjalk
			
 
				+	Corrected some subtractions that caused unsigned ints to turn
			
 
				+	pseudo-negative. (Reported by jaap@mcvax)
			
 
				+
			
 
				+11-Jan-1986	Dick Grune (dick) at tjalk
			
 
				+	Touched up for distribution.
			
 
				+
			
 
				+10-Jan-1986	Dick Grune (dick) at tjalk
			
 
				+	Fill_line was not called for empty lines, which caused them to be
			
 
				+	printed as repetitions of the previous line.
			
 
				+
			
 
				+24-Dec-1985	Dick Grune (dick) at tjalk
			
 
				+	Reduced hash table to a single array of indices; it is used only
			
 
				+	in one place, which makes it very easy to make it (the hash table)
			
 
				+	optional.  General tune-up of everything.  This seems to be
			
 
				+	another stable "final" version.
			
 
				+
			
 
				+14-Dec-1985	Dick Grune (dick) at tjalk
			
 
				+	Some experiments with hash formulas:
			
 
				+	h = (h OP CST) + *p++ OP CST yields	right	wrong
			
 
				+		* 96		- 32		205	562
			
 
				+		* 96		- 2		205	560
			
 
				+		* 96				205	560
			
 
				+		* 97				205	559
			
 
				+		<< 0				 66	3128
			
 
				+		<< 1				203	555
			
 
				+		<< 2				205	536
			
 
				+		<< 7				203	540
			
 
				+	Conclusion: it doesn't matter, unless you do it wrong.
			
 
				+
			
 
				+01-Oct-1983	Dick Grune (dick) at vu44
			
 
				+	Oldest known files.
			
 
				+
			
 
				+#	This file is part of the software similarity tester SIM.
			
 
				+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+#	$Id: ChangeLog,v 2.12 2007/08/27 09:57:30 dick Exp $
			
 
				+#
			
--- a/utils/sim_pasc/LICENSE.txt
+++ b/utils/sim_pasc/LICENSE.txt
@@ -0,0 +1,31 @@
 
				+Copyright (c) 1986, 2007, Dick Grune, Vrije Universiteit, The Netherlands
			
 
				+All rights reserved.
			
 
				+
			
 
				+Redistribution and use in source and binary forms,
			
 
				+with or without modification, are permitted provided
			
 
				+that the following conditions are met:
			
 
				+
			
 
				+   * Redistributions of source code must retain the above copyright
			
 
				+     notice, this list of conditions and the following disclaimer.
			
 
				+
			
 
				+   * Redistributions in binary form must reproduce the above
			
 
				+     copyright notice, this list of conditions and the following
			
 
				+     disclaimer in the documentation and/or other materials provided
			
 
				+     with the distribution.
			
 
				+
			
 
				+   * Neither the name of the Vrije Universiteit nor the names of its
			
 
				+     contributors may be used to endorse or promote products derived
			
 
				+     from this software without specific prior written permission.
			
 
				+
			
 
				+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
			
 
				+NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
			
 
				+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
			
 
				+IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
			
 
				+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
			
 
				+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
			
 
				+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
			
 
				+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
			
 
				+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
			
 
				+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
			
 
				+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
--- a/utils/sim_pasc/Makefile
+++ b/utils/sim_pasc/Makefile
@@ -0,0 +1,566 @@
 
				+#	This file is part of the software similarity tester SIM.
			
 
				+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+#	$Id: Makefile,v 2.17 2007/08/27 09:57:31 dick Exp $
			
 
				+#
			
 
				+
			
 
				+
			
 
				+#	E N T R Y   P O I N T S
			
 
				+test_sim:
			
 
				+
			
 
				+help:
			
 
				+	@echo 'Entry points:'
			
 
				+	@echo 'test_sim:	compile sim_c and run a simple test (default)'
			
 
				+	@echo ''
			
 
				+	@echo 'all:		create all binaries'
			
 
				+	@echo 'sim_X(.exe):	create specific binary for language X'
			
 
				+	@echo 'install_all:	install all binaries'
			
 
				+	@echo 'install.sim_X:	install specific binary for language X'
			
 
				+	@echo 'where X is one of c, java, pasc, m2, lisp, mira, text'
			
 
				+	@echo ''
			
 
				+	@echo 'lint:		lint sim_c sources'
			
 
				+	@echo 'lint.all:	lint all sim sources'
			
 
				+	@echo 'simsim:		run sim on the sim sources'
			
 
				+	@echo ''
			
 
				+	@echo 'simsrc.shr:	create sources shar file'
			
 
				+	@echo 'simsrc.zip:	create sources zip file'
			
 
				+	@echo 'depend:		update dependencies in Makefile'
			
 
				+	@echo 'clean:		remove created files'
			
 
				+	@echo ''
			
 
				+	@echo 'sim_exes:	create DOS executables in MSDOS; set date; make clean'
			
 
				+	@echo 'simexe.zip:	create DOS executables package in UNIX'
			
 
				+	@echo 'VERSION=2.X install_ftp:	install in the FTP directory in UNIX'
			
 
				+
			
 
				+VERSION =	2_21
			
 
				+
			
 
				+#
			
 
				+# When you modify any of the following flags, do 'make clean'
			
 
				+#
			
 
				+
			
 
				+include		sysidf.mk
			
 
				+
			
 
				+
			
 
				+# Flags
			
 
				+OPTLEVEL =	-O4#			#
			
 
				+
			
 
				+CFLAGS =	$(SYSTEM) $(OPTLEVEL) $(TESTTOKEN)
			
 
				+LFLAGS =	#			# loader flags
			
 
				+
			
 
				+TESTTOKEN =	#-DTESTTOKEN#		# define to test the token type
			
 
				+
			
 
				+
			
 
				+#	T E S T   P A R A M E T E R S
			
 
				+
			
 
				+# Parameters for two simple test runs, sim.res and stream.res:
			
 
				+TEST_LANG =	c#			# to test sim_X for language X
			
 
				+TEST_OPT =	-f -r 20#		# options to sim_X
			
 
				+TEST_INP =	pass3.c#		# guinea pig input
			
 
				+
			
 
				+TEST_OPT =	-p#			# options to sim_X
			
 
				+TEST_INP =	*.l#			# guinea pig input
			
 
				+TEST_INP =	simple*#		# guinea pig input
			
 
				+
			
 
				+
			
 
				+#	I N T R O D U C T I O N
			
 
				+
			
 
				+#	Each module (set of programs that together perform some function)
			
 
				+#	has the following sets of files defined for it:
			
 
				+#		_FLS	all files of that module, for, e.g.,
			
 
				+#			sharring, inventory, etc.
			
 
				+#		_SRC	the source files, from which other files derive
			
 
				+#		_CFS	the C-files, from which the object files derive
			
 
				+#		_OBJ	object files
			
 
				+#		_GRB	garbage files produced by compiling the module
			
 
				+#
			
 
				+#	(This is a feeble attempt at software-engineering a Makefile.)
			
 
				+#
			
 
				+
			
 
				+test_sim:	sim.res stream.res	# two simple tests
			
 
				+
			
 
				+
			
 
				+#	B I N A R I E S
			
 
				+
			
 
				+BINARIES =	sim_c$(EXE) sim_java$(EXE) sim_pasc$(EXE) \
			
 
				+		sim_m2$(EXE) sim_lisp$(EXE) sim_mira$(EXE) \
			
 
				+		sim_text$(EXE)
			
 
				+
			
 
				+all:		$(BINARIES)
			
 
				+
			
 
				+
			
 
				+#	C O M P I L A T I O N   R U L E S
			
 
				+
			
 
				+.SUFFIXES:	.o
			
 
				+.c.o:
			
 
				+		$(CC) -c $(CFLAGS) $<
			
 
				+
			
 
				+
			
 
				+#	A U X I L I A R Y   M O D U L E S
			
 
				+
			
 
				+# Common modules:
			
 
				+COM_CFS =	token.c lex.c stream.c text.c tokenarray.c error.c
			
 
				+COM_OBJ =	token.o lex.o stream.o text.o tokenarray.o error.o
			
 
				+COM_SRC =	token.h lex.h stream.h text.h tokenarray.h error.h \
			
 
				+		lang.h language.h \
			
 
				+		sortlist.spc sortlist.bdy system.par $(COM_CFS)
			
 
				+COM_FLS =	$(COM_SRC)
			
 
				+
			
 
				+# The idf module:
			
 
				+IDF_CFS =	idf.c
			
 
				+IDF_OBJ =	idf.o
			
 
				+IDF_SRC =	idf.h $(IDF_CFS)
			
 
				+IDF_FLS =	$(IDF_SRC)
			
 
				+
			
 
				+# The runs package:
			
 
				+RUNS_CFS =	runs.c percentages.c
			
 
				+RUNS_OBJ =	runs.o percentages.o
			
 
				+RUNS_SRC =	runs.h percentages.h $(RUNS_CFS)
			
 
				+RUNS_FLS =	$(RUNS_SRC) aiso.spc aiso.bdy
			
 
				+
			
 
				+# The main program:
			
 
				+MAIN_CFS =	sim.c options.c pass1.c hash.c compare.c add_run.c \
			
 
				+		pass2.c pass3.c
			
 
				+MAIN_OBJ =	sim.o options.o pass1.o hash.o compare.o add_run.o \
			
 
				+		pass2.o pass3.o
			
 
				+MAIN_SRC =	sim.h options.h pass1.h hash.h compare.h add_run.h \
			
 
				+		pass2.h pass3.h \
			
 
				+		debug.par settings.par $(MAIN_CFS)
			
 
				+MAIN_FLS =	$(MAIN_SRC)
			
 
				+
			
 
				+# The similarity tester without the language part:
			
 
				+SIM_CFS =	$(COM_CFS) $(IDF_CFS) $(RUNS_CFS) $(MAIN_CFS)
			
 
				+SIM_OBJ =	$(COM_OBJ) $(IDF_OBJ) $(RUNS_OBJ) $(MAIN_OBJ)
			
 
				+SIM_SRC =	$(COM_SRC) $(IDF_SRC) $(RUNS_SRC) $(MAIN_SRC)
			
 
				+SIM_FLS =	$(COM_FLS) $(IDF_FLS) $(RUNS_FLS) $(MAIN_FLS)
			
 
				+
			
 
				+
			
 
				+#	L A N G U A G E S
			
 
				+
			
 
				+# The algollike module:
			
 
				+ALG_CFS =	algollike.c
			
 
				+ALG_OBJ =	algollike.o
			
 
				+ALG_SRC =	algollike.h $(ALG_CFS)
			
 
				+ALG_FLS =	$(ALG_SRC)
			
 
				+
			
 
				+# The C Language module:					C
			
 
				+CLANG_CFS =	clang.c
			
 
				+CLANG_OBJ =	clang.o
			
 
				+CLANG_SRC =	clang.l
			
 
				+CLANG_FLS =	$(CLANG_SRC)
			
 
				+
			
 
				+clang.c:	clang.l
			
 
				+		$(LEX) -t clang.l >$@
			
 
				+
			
 
				+SIM_C_CFS =	$(SIM_CFS) $(ALG_CFS) $(CLANG_CFS)
			
 
				+SIM_C_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(CLANG_OBJ)
			
 
				+
			
 
				+sim_c$(EXE):	$(SIM_C_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_C_OBJ) -o $@
			
 
				+
			
 
				+SIM_C_GRB =	clang.c sim_c
			
 
				+
			
 
				+install.sim_c:	$(BINDIR)/sim_c$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_c$(EXE):	sim_c$(EXE)
			
 
				+		$(COPY) sim_c$(EXE) $@
			
 
				+
			
 
				+# The Java Language module:					Java
			
 
				+JAVALANG_CFS =	javalang.c
			
 
				+JAVALANG_OBJ =	javalang.o
			
 
				+JAVALANG_SRC =	javalang.l
			
 
				+JAVALANG_FLS =	$(JAVALANG_SRC)
			
 
				+
			
 
				+javalang.c:	javalang.l
			
 
				+		$(LEX) -t javalang.l >$@
			
 
				+
			
 
				+SIM_JAVA_CFS =	$(SIM_CFS) $(ALG_CFS) $(JAVALANG_CFS)
			
 
				+SIM_JAVA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(JAVALANG_OBJ)
			
 
				+
			
 
				+sim_java$(EXE):	$(SIM_JAVA_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_JAVA_OBJ) -o $@
			
 
				+
			
 
				+SIM_JAVA_GRB =	javalang.c sim_java
			
 
				+
			
 
				+install.sim_java:	$(BINDIR)/sim_java$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_java$(EXE):	sim_java$(EXE)
			
 
				+		$(COPY) sim_java$(EXE) $@
			
 
				+
			
 
				+# The Pascal Language module:					Pascal
			
 
				+PASCLANG_CFS =	pascallang.c
			
 
				+PASCLANG_OBJ =	pascallang.o
			
 
				+PASCLANG_SRC =	pascallang.l
			
 
				+PASCLANG_FLS =	$(PASCLANG_SRC)
			
 
				+
			
 
				+pascallang.c:	pascallang.l
			
 
				+		$(LEX) -t pascallang.l >pascallang.c
			
 
				+
			
 
				+SIM_PASC_CFS =	$(SIM_CFS) $(ALG_CFS) $(PASCLANG_CFS)
			
 
				+SIM_PASC_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(PASCLANG_OBJ)
			
 
				+
			
 
				+sim_pasc$(EXE):	$(SIM_PASC_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_PASC_OBJ) -o $@
			
 
				+
			
 
				+SIM_PASC_GRB =	pascallang.c sim_pasc
			
 
				+
			
 
				+install.sim_pasc:	$(BINDIR)/sim_pasc$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_pasc$(EXE):	sim_pasc$(EXE)
			
 
				+		$(COPY) sim_pasc$(EXE) $@
			
 
				+
			
 
				+# The Modula-2 Language module:					Modula-2
			
 
				+M2LANG_CFS =	m2lang.c
			
 
				+M2LANG_OBJ =	m2lang.o
			
 
				+M2LANG_SRC =	m2lang.l
			
 
				+M2LANG_FLS =	$(M2LANG_SRC)
			
 
				+
			
 
				+m2lang.c:	m2lang.l
			
 
				+		$(LEX) -t m2lang.l >$@
			
 
				+
			
 
				+SIM_M2_CFS =	$(SIM_CFS) $(ALG_CFS) $(M2LANG_CFS)
			
 
				+SIM_M2_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(M2LANG_OBJ)
			
 
				+
			
 
				+sim_m2$(EXE):	$(SIM_M2_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_M2_OBJ) -o $@
			
 
				+
			
 
				+SIM_M2_GRB =	m2lang.c sim_m2
			
 
				+
			
 
				+install.sim_m2:	$(BINDIR)/sim_m2$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_m2$(EXE):	sim_m2$(EXE)
			
 
				+		$(COPY) sim_m2$(EXE) $@
			
 
				+
			
 
				+# The Lisp Language module:					Lisp
			
 
				+LISPLANG_CFS =	lisplang.c
			
 
				+LISPLANG_OBJ =	lisplang.o
			
 
				+LISPLANG_SRC =	lisplang.l
			
 
				+LISPLANG_FLS =	$(LISPLANG_SRC)
			
 
				+
			
 
				+lisplang.c:	lisplang.l
			
 
				+		$(LEX) -t lisplang.l >$@
			
 
				+
			
 
				+SIM_LISP_CFS =	$(SIM_CFS) $(ALG_CFS) $(LISPLANG_CFS)
			
 
				+SIM_LISP_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(LISPLANG_OBJ)
			
 
				+
			
 
				+sim_lisp$(EXE):	$(SIM_LISP_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_LISP_OBJ) -o $@
			
 
				+
			
 
				+SIM_LISP_GRB =	lisplang.c sim_lisp
			
 
				+
			
 
				+install.sim_lisp:	$(BINDIR)/sim_lisp$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_lisp$(EXE):	sim_lisp$(EXE)
			
 
				+		$(COPY) sim_lisp$(EXE) $@
			
 
				+
			
 
				+# The Miranda Language module:					Miranda
			
 
				+MIRALANG_CFS =	miralang.c
			
 
				+MIRALANG_OBJ =	miralang.o
			
 
				+MIRALANG_SRC =	miralang.l
			
 
				+MIRALANG_FLS =	$(MIRALANG_SRC)
			
 
				+
			
 
				+miralang.c:	miralang.l
			
 
				+		$(LEX) -t miralang.l >$@
			
 
				+
			
 
				+SIM_MIRA_CFS =	$(SIM_CFS) $(ALG_CFS) $(MIRALANG_CFS)
			
 
				+SIM_MIRA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(MIRALANG_OBJ)
			
 
				+
			
 
				+sim_mira$(EXE):	$(SIM_MIRA_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_MIRA_OBJ) -o $@
			
 
				+
			
 
				+SIM_MIRA_GRB =	miralang.c sim_mira
			
 
				+
			
 
				+install.sim_mira:	$(BINDIR)/sim_mira$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_mira$(EXE):	sim_mira$(EXE)
			
 
				+		$(COPY) sim_mira$(EXE) $@
			
 
				+
			
 
				+# The Text module:						Text
			
 
				+TEXTLANG_CFS =	textlang.c
			
 
				+TEXTLANG_OBJ =	textlang.o
			
 
				+TEXTLANG_SRC =	textlang.l
			
 
				+TEXTLANG_FLS =	$(TEXTLANG_SRC)
			
 
				+
			
 
				+textlang.c:	textlang.l
			
 
				+		$(LEX) -t textlang.l >$@
			
 
				+
			
 
				+SIM_TEXT_CFS =	$(SIM_CFS) $(TEXTLANG_CFS)
			
 
				+SIM_TEXT_OBJ =	$(SIM_OBJ) $(TEXTLANG_OBJ)
			
 
				+
			
 
				+sim_text$(EXE):	$(SIM_TEXT_OBJ)
			
 
				+		$(CC) $(LFLAGS) $(SIM_TEXT_OBJ) -o $@
			
 
				+
			
 
				+SIM_TEXT_GRB =	textlang.c sim_text
			
 
				+
			
 
				+install.sim_text:	$(BINDIR)/sim_text$(EXE) $(MANDIR)/sim.1
			
 
				+
			
 
				+$(BINDIR)/sim_text$(EXE):	sim_text$(EXE)
			
 
				+		$(COPY) sim_text$(EXE) $@
			
 
				+
			
 
				+
			
 
				+#	T E S T S
			
 
				+
			
 
				+# Some simple tests:
			
 
				+sim.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
			
 
				+		./sim_$(TEST_LANG)$(EXE) $(TEST_OPT) $(TEST_INP)
			
 
				+#		./sim_$(TEST_LANG)$(EXE) -x $(TEST_OPT) $(TEST_INP)
			
 
				+
			
 
				+stream.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
			
 
				+		./sim_$(TEST_LANG)$(EXE) -- $(TEST_INP) >stream.res
			
 
				+		wc stream.res $(TEST_INP)
			
 
				+
			
 
				+TEST_GRB =	stream.res
			
 
				+
			
 
				+# More simple tests, on the C version only:
			
 
				+simsim:		sim_c$(EXE) $(SRC)
			
 
				+		./sim_c$(EXE) -fr 20 $(SRC)
			
 
				+
			
 
				+# Lint
			
 
				+lint:		$(SIM_C_CFS)
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
			
 
				+
			
 
				+lint.all:	$(SIM_C_CFS) $(SIM_JAVA_CFS) $(SIM_PASC_CFS) $(SIM_M2_CFS) \
			
 
				+		$(SIM_LISP_CFS) $(SIM_MIRA_CFS) $(SIM_TEXT_CFS)
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_JAVA_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_PASC_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_M2_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_LISP_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_MIRA_CFS) | grep -v yy
			
 
				+		$(LINT) $(LINTFLAGS) $(SIM_TEXT_CFS) | grep -v yy
			
 
				+
			
 
				+
			
 
				+#	O T H E R   E N T R I E S
			
 
				+
			
 
				+# Sets of files: general, modules, main programs, languages
			
 
				+CFS =		$(SIM_CFS) $(ALG_CFS) \
			
 
				+		$(CLANG_CFS) $(JAVALANG_CFS) $(PASCLANG_CFS) $(M2LANG_CFS) \
			
 
				+		$(LISPLANG_CFS) $(MIRALANG_CFS) $(TEXTLANG_CFS)
			
 
				+OBJ =		$(SIM_OBJ) $(ALG_OBJ) \
			
 
				+		$(CLANG_OBJ) $(JAVALANG_OBJ) $(PASCLANG_OBJ) $(M2LANG_OBJ) \
			
 
				+		$(LISPLANG_OBJ) $(MIRALANG_OBJ) $(TEXTLANG_OBJ)
			
 
				+SRC =		$(SIM_SRC) $(ALG_SRC) \
			
 
				+		$(CLANG_SRC) $(JAVALANG_SRC) $(PASCLANG_SRC) $(M2LANG_SRC) \
			
 
				+		$(LISPLANG_SRC) $(MIRALANG_SRC) $(TEXTLANG_SRC)
			
 
				+FLS =		$(SIM_FLS) $(ALG_FLS) \
			
 
				+		$(CLANG_FLS) $(JAVALANG_FLS) $(PASCLANG_FLS) $(M2LANG_FLS) \
			
 
				+		$(LISPLANG_FLS) $(MIRALANG_FLS) $(TEXTLANG_FLS) \
			
 
				+		sysidf.mk sysidf.msdos sysidf.unix
			
 
				+DOC =		READ_ME READ.ME README.1st sim.1 sim.txt sim.html \
			
 
				+		ChangeLog Answers TechnReport
			
 
				+
			
 
				+ALL_FLS =	Makefile $(FLS) $(DOC)
			
 
				+
			
 
				+# Create .EXE archive for MSDOS
			
 
				+SIM_EXES =	sim_c.exe sim_java.exe sim_pasc.exe sim_m2.exe \
			
 
				+		sim_lisp.exe sim_mira.exe sim_text.exe
			
 
				+DOSZIP =	READ.ME sim.txt $(SIM_EXES)
			
 
				+sim_exes:	$(SIM_EXES)
			
 
				+
			
 
				+simexe.zip:	$(DOSZIP)
			
 
				+		$(ZIP) $@ $(DOSZIP)
			
 
				+
			
 
				+DOS_GRB =	simexe.zip
			
 
				+
			
 
				+# Install and clean scripts
			
 
				+install_all:	install			# just a synonym
			
 
				+install:	install.sim_c install.sim_java install.sim_pasc \
			
 
				+		install.sim_m2 install.sim_lisp install.sim_mira \
			
 
				+		install.sim_text
			
 
				+
			
 
				+$(MANDIR)/sim.1:	sim.1
			
 
				+		$(COPY) sim.1 $@
			
 
				+
			
 
				+FTPFILES =	README.1st READ_ME LICENSE.txt TechnReport
			
 
				+
			
 
				+install_ftp:	$(FTPFILES) simsrc.shr simexe.zip sim.pdf
			
 
				+		cp -p simsrc.shr sim_$(VERSION).shar
			
 
				+		cp -p simexe.zip sim_$(VERSION).zip
			
 
				+		cp -p $(FTPFILES) sim_$(VERSION).shar sim_$(VERSION).zip \
			
 
				+			README.1st READ.ME READ_ME sim.pdf \
			
 
				+			$(FTPDIR)/.
			
 
				+		rm -f sim_$(VERSION).shar sim_$(VERSION).zip
			
 
				+		ls -l $(FTPDIR)/.
			
 
				+
			
 
				+simsrc.shr:	$(ALL_FLS)
			
 
				+		shar $(ALL_FLS) >$@
			
 
				+
			
 
				+simsrc.zip:	$(ALL_FLS)
			
 
				+		$(ZIP) $@ $(ALL_FLS)
			
 
				+
			
 
				+sim.txt:	sim.1
			
 
				+		nroff -man sim.1 | sed 's/.//g' >$@
			
 
				+
			
 
				+sim.pdf:	sim.1
			
 
				+		troff -man sim.1 | devps | ps2pdf -sPAPERSIZE=a4 - $@
			
 
				+
			
 
				+INSTALL_GRB =	simsrc.shr simsrc.zip sim.txt sim.pdf
			
 
				+
			
 
				+depend:		$(CFS)
			
 
				+		makedepend -w 1 -Dlint $(CFS)
			
 
				+
			
 
				+.PHONY:		clean fresh
			
 
				+clean:
			
 
				+		-rm -f *.o
			
 
				+		-rm -f $(SIM_C_GRB)
			
 
				+		-rm -f $(SIM_JAVA_GRB)
			
 
				+		-rm -f $(SIM_PASC_GRB)
			
 
				+		-rm -f $(SIM_M2_GRB)
			
 
				+		-rm -f $(SIM_LISP_GRB)
			
 
				+		-rm -f $(SIM_MIRA_GRB)
			
 
				+		-rm -f $(SIM_TEXT_GRB)
			
 
				+		-rm -f $(TEST_GRB)
			
 
				+		-rm -f $(INSTALL_GRB)
			
 
				+		-rm -f a.out a.exe sim.txt core mon.out
			
 
				+
			
 
				+fresh:		clean
			
 
				+		-rm -f $(DOS_GRB)
			
 
				+		-rm -f *.exe
			
 
				+
			
 
				+#	D E P E N D E N C I E S
			
 
				+
			
 
				+# DO NOT DELETE THIS LINE -- make depend depends on it.
			
 
				+
			
 
				+token.o: token.h
			
 
				+lex.o: token.h
			
 
				+lex.o: lex.h
			
 
				+stream.o: system.par
			
 
				+stream.o: token.h
			
 
				+stream.o: lex.h
			
 
				+stream.o: lang.h
			
 
				+stream.o: stream.h
			
 
				+text.o: debug.par
			
 
				+text.o: sim.h
			
 
				+text.o: token.h
			
 
				+text.o: stream.h
			
 
				+text.o: lex.h
			
 
				+text.o: options.h
			
 
				+text.o: error.h
			
 
				+text.o: text.h
			
 
				+tokenarray.o: error.h
			
 
				+tokenarray.o: lex.h
			
 
				+tokenarray.o: token.h
			
 
				+tokenarray.o: tokenarray.h
			
 
				+error.o: sim.h
			
 
				+error.o: error.h
			
 
				+idf.o: system.par
			
 
				+idf.o: token.h
			
 
				+idf.o: idf.h
			
 
				+runs.o: sim.h
			
 
				+runs.o: runs.h
			
 
				+runs.o: aiso.spc
			
 
				+runs.o: aiso.bdy
			
 
				+percentages.o: sim.h
			
 
				+percentages.o: runs.h
			
 
				+percentages.o: aiso.spc
			
 
				+percentages.o: error.h
			
 
				+percentages.o: percentages.h
			
 
				+percentages.o: sortlist.bdy
			
 
				+sim.o: settings.par
			
 
				+sim.o: sim.h
			
 
				+sim.o: options.h
			
 
				+sim.o: language.h
			
 
				+sim.o: token.h
			
 
				+sim.o: error.h
			
 
				+sim.o: hash.h
			
 
				+sim.o: compare.h
			
 
				+sim.o: pass1.h
			
 
				+sim.o: pass2.h
			
 
				+sim.o: pass3.h
			
 
				+sim.o: stream.h
			
 
				+sim.o: lex.h
			
 
				+options.o: options.h
			
 
				+pass1.o: debug.par
			
 
				+pass1.o: sim.h
			
 
				+pass1.o: text.h
			
 
				+pass1.o: tokenarray.h
			
 
				+pass1.o: token.h
			
 
				+pass1.o: lex.h
			
 
				+pass1.o: error.h
			
 
				+pass1.o: pass1.h
			
 
				+hash.o: system.par
			
 
				+hash.o: debug.par
			
 
				+hash.o: sim.h
			
 
				+hash.o: error.h
			
 
				+hash.o: language.h
			
 
				+hash.o: token.h
			
 
				+hash.o: tokenarray.h
			
 
				+hash.o: options.h
			
 
				+hash.o: hash.h
			
 
				+compare.o: sim.h
			
 
				+compare.o: tokenarray.h
			
 
				+compare.o: token.h
			
 
				+compare.o: hash.h
			
 
				+compare.o: language.h
			
 
				+compare.o: options.h
			
 
				+compare.o: add_run.h
			
 
				+compare.o: compare.h
			
 
				+add_run.o: sim.h
			
 
				+add_run.o: runs.h
			
 
				+add_run.o: aiso.spc
			
 
				+add_run.o: percentages.h
			
 
				+add_run.o: options.h
			
 
				+add_run.o: error.h
			
 
				+add_run.o: add_run.h
			
 
				+pass2.o: debug.par
			
 
				+pass2.o: sim.h
			
 
				+pass2.o: text.h
			
 
				+pass2.o: lex.h
			
 
				+pass2.o: token.h
			
 
				+pass2.o: pass2.h
			
 
				+pass2.o: sortlist.bdy
			
 
				+pass3.o: system.par
			
 
				+pass3.o: debug.par
			
 
				+pass3.o: sim.h
			
 
				+pass3.o: runs.h
			
 
				+pass3.o: aiso.spc
			
 
				+pass3.o: error.h
			
 
				+pass3.o: options.h
			
 
				+pass3.o: pass3.h
			
 
				+pass3.o: percentages.h
			
 
				+pass3.o: tokenarray.h
			
 
				+pass3.o: token.h
			
 
				+algollike.o: options.h
			
 
				+algollike.o: token.h
			
 
				+algollike.o: algollike.h
			
 
				+algollike.o: language.h
			
 
				+clang.o: options.h
			
 
				+clang.o: algollike.h
			
 
				+clang.o: language.h
			
 
				+clang.o: token.h
			
 
				+clang.o: idf.h
			
 
				+clang.o: lex.h
			
 
				+clang.o: lang.h
			
 
				+javalang.o: options.h
			
 
				+javalang.o: algollike.h
			
 
				+javalang.o: language.h
			
 
				+javalang.o: token.h
			
 
				+javalang.o: idf.h
			
 
				+javalang.o: lex.h
			
 
				+javalang.o: lang.h
			
 
				+pascallang.o: options.h
			
 
				+pascallang.o: algollike.h
			
 
				+pascallang.o: language.h
			
 
				+pascallang.o: token.h
			
 
				+pascallang.o: idf.h
			
 
				+pascallang.o: lex.h
			
 
				+pascallang.o: lang.h
			
 
				+m2lang.o: options.h
			
 
				+m2lang.o: algollike.h
			
 
				+m2lang.o: language.h
			
 
				+m2lang.o: token.h
			
 
				+m2lang.o: idf.h
			
 
				+m2lang.o: lex.h
			
 
				+m2lang.o: lang.h
			
 
				+lisplang.o: language.h
			
 
				+lisplang.o: token.h
			
 
				+lisplang.o: lex.h
			
 
				+lisplang.o: lang.h
			
 
				+lisplang.o: idf.h
			
 
				+miralang.o: language.h
			
 
				+miralang.o: token.h
			
 
				+miralang.o: lex.h
			
 
				+miralang.o: lang.h
			
 
				+miralang.o: idf.h
			
 
				+textlang.o: language.h
			
 
				+textlang.o: token.h
			
 
				+textlang.o: idf.h
			
 
				+textlang.o: lex.h
			
 
				+textlang.o: lang.h
			
--- a/utils/sim_pasc/READ.ME
+++ b/utils/sim_pasc/READ.ME
@@ -0,0 +1,34 @@
 
				+#	This file is part of the software similarity tester SIM.
			
 
				+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+#	$Id: READ.ME,v 2.8 2005/02/20 17:02:58 dick Exp $
			
 
				+
			
 
				+These programs test for similar (or equal) stretches in one or more program
			
 
				+files and can be used to detect common code or plagiarism. See SIM.TXT.
			
 
				+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
			
 
				+natural text.
			
 
				+
			
 
				+This READ.ME file describes the MSDOS version. The UNIX version is described
			
 
				+in the file READ_ME.
			
 
				+
			
 
				+The archive SIM_2_21.ZIP contains:
			
 
				+	READ.ME			this READ.ME file
			
 
				+	SIM.TXT			a 2-page manual, UNIX-style
			
 
				+	SIM_C.EXE		similarity tester for C
			
 
				+	SIM_JAVA.EXE		similarity tester for Java
			
 
				+	SIM_PASC.EXE		similarity tester for Pascal
			
 
				+	SIM_M2.EXE		similarity tester for Modula-2
			
 
				+	SIM_LISP.EXE		similarity tester for Lisp
			
 
				+	SIM_MIRA.EXE		similarity tester for Miranda
			
 
				+	SIM_TEXT.EXE		similarity tester for text
			
 
				+
			
 
				+The MSDOS version does not contain sources. The sources are available from
			
 
				+the UNIX archive sim_2_21.shar, but require a C compiler, flex and make.
			
 
				+
			
 
				+					Dick Grune
			
 
				+					Vrije Universiteit
			
 
				+					de Boelelaan 1081
			
 
				+					1081 HV  Amsterdam
			
 
				+					the Netherlands
			
 
				+					email: [email protected]
			
 
				+					ftp://ftp.cs.vu.nl/pub/dick
			
 
				+					http://www.cs.vu.nl/~dick
			
--- a/utils/sim_pasc/README.1st
+++ b/utils/sim_pasc/README.1st
@@ -0,0 +1,68 @@
 
				+This is SIM, Software and text similarity tester, most recent revision
			
 
				+                                                               (2.19, 20050220)
			
 
				+by Dick Grune, Vrije Universiteit, Amsterdam, the Netherlands ([email protected]).
			
 
				+
			
 
				+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
			
 
				+Miranda and natural language. It can be used
			
 
				+
			
 
				+- to detect potentially duplicated code fragments in large software projects,
			
 
				+- to detect plagiarism in software and text-based projects, educational and
			
 
				+  otherwise.
			
 
				+
			
 
				+The program is fast:
			
 
				+the UNIX version on a Sun ULTRA does about 50000 tokens/sec,
			
 
				+the DOS version on a Pentium 166 does about 25000 tokens/sec.
			
 
				+
			
 
				+SIM is available for UNIX (in source code) and MSDOS (32-bit executables).
			
 
				+
			
 
				+UNIX:
			
 
				+	To obtain the files, do:
			
 
				+		sh sim_2_21.shar
			
 
				+	This unpacks the sources, the Makefile, sim.1 and READ_ME.
			
 
				+	For installation notes and other info then see READ_ME.
			
 
				+
			
 
				+MSDOS:
			
 
				+	To obtain the files, do:
			
 
				+		[pk]unzip SIM_2_21.zip
			
 
				+	This unpacks the executables, SIM.TXT and READ.ME.
			
 
				+	For other info then see READ.ME.
			
 
				+
			
 
				+Changes from Release 2.19:
			
 
				+	Various changes necessitated by Linux flex being different
			
 
				+
			
 
				+Changes from Release 2.16:
			
 
				+	Various updates and adjustments in the code and the installation
			
 
				+	procedure.
			
 
				+
			
 
				+Changes from Release 2.13:
			
 
				+	Percentage reporting feature added.
			
 
				+
			
 
				+Changes from Release 2.12:
			
 
				+	Miranda checker added.
			
 
				+
			
 
				+Changes from Release 2.9:
			
 
				+	Java checker added.
			
 
				+	The C checker 'sim' was renamed to 'sim_c', for uniformity.
			
 
				+	Converted the sources to ANSI C.
			
 
				+	All versions now report non-ASCII characters in the input.
			
 
				+
			
 
				+Changes from Release 2.8:
			
 
				+	DOS versions can now compare very large files (>400000 tokens)
			
 
				+
			
 
				+Changes from Release 1.21, as posted in comp.sources.unix (1987):
			
 
				+	Ported to MSDOS
			
 
				+	Significant speed improvements
			
 
				+	New options: -e, -S and / , to compare files group-wise
			
 
				+	New option: -F , to require function names to match exactly
			
 
				+	Lisp version added
			
 
				+	Miscellaneous improvements
			
 
				+
			
 
				+
			
 
				+					Dick Grune
			
 
				+					Vrije Universiteit
			
 
				+					de Boelelaan 1081
			
 
				+					1081 HV  Amsterdam
			
 
				+					the Netherlands
			
 
				+					email: [email protected]
			
 
				+					ftp://ftp.cs.vu.nl/pub/dick
			
 
				+					http://www.cs.vu.nl/~dick
			
--- a/utils/sim_pasc/READ_ME
+++ b/utils/sim_pasc/READ_ME
@@ -0,0 +1,52 @@
 
				+#	This file is part of the software similarity tester SIM.
			
 
				+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+#	$Id: READ_ME,v 2.6 2005/02/20 17:02:59 dick Exp $
			
 
				+
			
 
				+These programs test for similar (or equal) stretches in one or more program
			
 
				+files and can be used to detect common code or plagiarism. See sim.1.
			
 
				+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
			
 
				+natural text.
			
 
				+
			
 
				+This READ_ME file describes the UNIX version. The MSDOS version is described
			
 
				+in the file READ.ME.
			
 
				+
			
 
				+To obtain the sources, do
			
 
				+	sh sim_2_21.shar
			
 
				+
			
 
				+To compile and test, just call
			
 
				+	make
			
 
				+This will generate one executable called sim_c, the checker for C, and will
			
 
				+run two small tests to show sample output.
			
 
				+
			
 
				+To install, examine sysidf.mk, reset BINDIR and MANDIR to sensible paths,
			
 
				+and call
			
 
				+	make install.sim_c			for C
			
 
				+	make install.sim_java			for Java
			
 
				+	make install.sim_pasc			for Pascal
			
 
				+	make install.sim_m2			for Modula-2
			
 
				+	make install.sim_lisp			for Lisp
			
 
				+	make install.sim_mira			for Miranda
			
 
				+	make install.sim_text			for text
			
 
				+or
			
 
				+	make install.all			for everything.
			
 
				+These will also install the manual page.
			
 
				+
			
 
				+To change the default run size or the page width, adjust the file params.h
			
 
				+and recompile.
			
 
				+
			
 
				+To add another language L, write a file Llang.l along the lines of clang.l
			
 
				+and the other *lang.l files, extend the Makefile and recompile.
			
 
				+All knowledge about a given language L is located in Llang.l; the rest of
			
 
				+the programs expect each token to be a single character.
			
 
				+
			
 
				+Available at present:
			
 
				+	clang.l javalang.l pascallang.l m2lang.l lisplang.l miralang.l text.l
			
 
				+
			
 
				+					Dick Grune
			
 
				+					Vrije Universiteit
			
 
				+					de Boelelaan 1081
			
 
				+					1081 HV  Amsterdam
			
 
				+					the Netherlands
			
 
				+					email: [email protected]
			
 
				+					ftp://ftp.cs.vu.nl/pub/dick
			
 
				+					http://www.cs.vu.nl/~dick
			
--- a/utils/sim_pasc/TechnReport
+++ b/utils/sim_pasc/TechnReport
@@ -0,0 +1,214 @@
 
				+		CONCISE REPORT ON THE ALGORITHMS IN SIM			970623
			
 
				+
			
 
				+
			
 
				+
			
 
				+	INTRODUCTION
			
 
				+
			
 
				+The general outline of the similarity checker is as follows:
			
 
				+
			
 
				+	1. the files are read in (pass 1)
			
 
				+	2. a forward-reference table is prepared
			
 
				+	3. the set of interesting runs is determined
			
 
				+	4. the line numbers of the runs are determined (pass 2)
			
 
				+	5. the contents of the runs are printed in order (pass 3)
			
 
				+
			
 
				+To keep the memory requirements (relatively) small, the exact positions
			
 
				+of the tokens are not recorded.  This necessitates pass 2.  See, however,
			
 
				+the pertinent chapter.
			
 
				+
			
 
				+
			
 
				+	READING THE FILES
			
 
				+
			
 
				+Each file is tokenized using a lex-generated scanner appropriate for
			
 
				+the input.  Each token fits in one byte, possibly using all 8 bits.  The
			
 
				+tokens are stored in the array TokenArray[], which is extended by
			
 
				+reallocation if it overflows.  See tokenarray.c.
			
 
				+
			
 
				+Also, to optimize away pass 2, an attempt is made to remember the token
			
 
				+positions of all beginnings of lines.  The token-positions at BOL are
			
 
				+stored in the array nl_buff[], which is also extended by reallocation,
			
 
				+if needed.  If the attempt fails due to lack of memory, nl_buff[] is
			
 
				+abandoned, and pass2 will read the files instead.
			
 
				+
			
 
				+
			
 
				+	PREPARING THE FORWARD-REFERENCE TABLE
			
 
				+
			
 
				+Text is compared by comparing every substring to all substrings
			
 
				+to the right of it; this process is in essence quadratic.  However,
			
 
				+only substrings of length at least 'MinRunSize' are of interest,
			
 
				+which gives us the possibility to speed up this process by using
			
 
				+a hash table.
			
 
				+
			
 
				+Once the entire text has been read in, a forward-reference table
			
 
				+forward_references[] is made (see hash.c).
			
 
				+For every position in the text, we construct an index which gives
			
 
				+the next position in the text where a run of MinRunSize tokens
			
 
				+starts that has the same hash code.  If there is no such run, the
			
 
				+index is 0.
			
 
				+
			
 
				+To fill in this array, we use a hash table last_index[], such that
			
 
				+last_index[i] is the index of the latest token with hash_code i, or 0 if
			
 
				+there is none.  If at a given position p, we find that the text ahead of
			
 
				+us has hash code i, last_index[i] tells us which position in
			
 
				+forward_references[] will have to be updated to p.
			
 
				+See MakeForwardReferences().
			
 
				+
			
 
				+For long text sequences (say hundreds of thousands of tokens), the
			
 
				+hashing is not really efficient any more since too many spurious matches
			
 
				+occur.  Therefore, the forward reference table is scanned a second time,
			
 
				+eliminating from any chain all references to runs that do not start with
			
 
				+and end in the same token (actually this is a second hash code).
			
 
				+For the UNIX manuals this reduced the number of matches from 91.9% to 1.9%
			
 
				+(of which 0.06% was genuine).
			
 
				+
			
 
				+	DETERMINING THE SET OF INTERESTING RUNS
			
 
				+
			
 
				+The overall structure of the routine Compare() (see compare.c) is:
			
 
				+
			
 
				+for all new files
			
 
				+	for all texts it must be compared to
			
 
				+		for all positions in the new file
			
 
				+			for all positions in the text
			
 
				+				for ever increasing sizes
			
 
				+					try to match and keep the best
			
 
				+
			
 
				+If for a given position in the new file a good run (i.e. one of at least
			
 
				+minimum length) has been found, the run is registered using a call of
			
 
				+add_run(), the run is skipped in the new file and searching continues at
			
 
				+the position after it.  This prevents duplicate reports of runs.
			
 
				+
			
 
				+Add_run() allocates a struct run for the run (see sim.h)
			
 
				+which contains two struct chunks and a quality description.  It fills
			
 
				+in the two chunks with the pertinent info, one for the first file and
			
 
				+one for the second (which may be the same, if the run relates two chunks
			
 
				+in the same file).
			
 
				+
			
 
				+The run is then entered into the arbitrary-in-sorted-out store AISO (see
			
 
				+aiso.spc and aiso.bdy, a genuine generic abstract data type in C!), in
			
 
				+which it is inserted according to its quality.  Both positions
			
 
				+(struct position) in both chunks in the run (so four in total) are each
			
 
				+entered in a linked list starting at the tx_pos field in the struct text
			
 
				+of the appropriate file.
			
 
				+
			
 
				+When this is finished, the forward reference table can be deleted.
			
 
				+
			
 
				+So the final results of this phase are visible both through the tx_pos
			
 
				+fields and through the aiso interface.
			
 
				+
			
 
				+
			
 
				+	DETERMINING THE EXACT POSITION OF EACH RUN (PASS 2)
			
 
				+
			
 
				+The purpose of this pass is to find for each chunk, which up to now is
			
 
				+known by token position only, its starting and ending line number (which
			
 
				+cannot be easily derived from the token position).
			
 
				+
			
 
				+For each file that has a non-zero tx_pos field, i.e. that has some
			
 
				+interesting chunks, the positions in the tx_pos list are sorted on
			
 
				+ascending line number (they have been found in essentially arbitrary
			
 
				+order) by sort_pos() in pass2.c.
			
 
				+
			
 
				+Next we scan the pos list and the file in parallel, updating the info in
			
 
				+a position when we meet it.  A position carries an indication whether it
			
 
				+is a starting or an ending position, since slightly differing
			
 
				+calculations have to be done in each case.
			
 
				+
			
 
				+Actually, if the nl_buff[] data structure still exists, the file is not
			
 
				+accessed at all and the data from nl_buff[] is used instead.  This is
			
 
				+done transparently in buff.c.
			
 
				+
			
 
				+
			
 
				+	PRINTING THE CONTENTS OF THE RUNS (PASS 3)
			
 
				+
			
 
				+Since each struct run has now been completely filled in, this is simple;
			
 
				+the hard work is calculating the page layout.
			
 
				+Pass3() accesses the aiso store and retrieves from it the runs in
			
 
				+descending order of importance.  Show_run() opens both files, positions
			
 
				+them using the line numbers and prints the runs.
			
 
				+
			
 
				+================================================================
			
 
				+	CODE EXCERPT OF THE SOFTWARE SIMILARITY TESTER SIM (980222)
			
 
				+
			
 
				+sim:
			
 
				+	get command line options
			
 
				+	check the options
			
 
				+
			
 
				+	init language, to precompute tables
			
 
				+
			
 
				+	pass1, read the files
			
 
				+		# there is an array TokenArray[] that holds all input tokens
			
 
				+
			
 
				+	make forward reference table
			
 
				+		# there is an array forward_references[], with one entry for
			
 
				+		#   each token in the input; forward_references[i] gives the
			
 
				+		#   token number where a token sequence starts with the same
			
 
				+		#   hash value as the one starting at i
			
 
				+
			
 
				+	compare various files to find runs
			
 
				+	delete forward reference table
			
 
				+	pass2, find newline positions of found similarities
			
 
				+	pass3, print the similarities
			
 
				+
			
 
				+
			
 
				+
			
 
				+pass1, read the files:
			
 
				+	for each file
			
 
				+		divide the text into tokens
			
 
				+		store all tokens except newlines in TokenArray and try to
			
 
				+			keep a record of the newline positions
			
 
				+
			
 
				+
			
 
				+
			
 
				+make forward reference table:
			
 
				+	# there are two independent hash functions, hash1() and hash2().
			
 
				+	#   hash1(i) gives the hash value of the token sequence starting at i
			
 
				+	#   likewise for hash2(i)
			
 
				+
			
 
				+	set up the forward references using the last_index table:
			
 
				+		# there is an array last_index[], with one entry for each
			
 
				+		#   possible hash value; last_index[i] gives the position in
			
 
				+		#   forward_references[] at which i was most recently
			
 
				+		#   encountered as a hash value
			
 
				+		for each file
			
 
				+			for all positions in file except the last MinRunSize
			
 
				+				set forward_references[] and update last_index[]
			
 
				+
			
 
				+	use hash2() to clean out matches:
			
 
				+		for all tokens
			
 
				+			find first token in chain with same hash2 code
			
 
				+			short-circuit forward reference to it
			
 
				+
			
 
				+
			
 
				+
			
 
				+compare:
			
 
				+	for all new files
			
 
				+		for all texts it must be compared to
			
 
				+			for all positions in the new file
			
 
				+				for all positions in the text
			
 
				+					for ever increasing sizes
			
 
				+						try to match and keep the best
			
 
				+	try to match and keep the best:
			
 
				+		# using forward_references[], we find a list of positions in
			
 
				+		#   which a matching token sequence will start;
			
 
				+		#   scanning this list, we measure the maximum length of the
			
 
				+		#   match and add the longest match to the run collection
			
 
				+
			
 
				+
			
 
				+
			
 
				+pass2, find positions of found runs:
			
 
				+	for all files:
			
 
				+		sort the positions in the runs
			
 
				+
			
 
				+		# we scan the pos list and the file in parallel
			
 
				+		for all positions inside this file
			
 
				+			if it matches a token position in a run
			
 
				+				record line number
			
 
				+
			
 
				+
			
 
				+
			
 
				+pass3, print the similarities:
			
 
				+	for all runs
			
 
				+		# a run consists of two chunks
			
 
				+		open the files that hold the chunks and position them
			
 
				+		  at the beginning of the chunk
			
 
				+		display the chunks
			
 
				+
			
--- a/utils/sim_pasc/add_run.c
+++ b/utils/sim_pasc/add_run.c
@@ -0,0 +1,70 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: add_run.c,v 2.5 2001/11/08 12:30:28 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"sim.h"
			
 
				+#include	"runs.h"
			
 
				+#include	"percentages.h"
			
 
				+#include	"options.h"
			
 
				+#include	"error.h"
			
 
				+#include	"add_run.h"
			
 
				+
			
 
				/* Forward declarations of the file-local helpers defined below. */

				/* Fills *cnk with the piece of txt starting at start and spanning size tokens. */
				static void set_chunk(struct chunk *cnk, struct text *txt,
						      unsigned int start, unsigned int size);

				/* Fills a single struct position and links it into the list at txt->tx_pos. */
				static void set_pos(struct position *pos, int type,
						    struct text *txt, unsigned int start);
			
 
				+
			
 
				+void
			
 
				+add_run(struct text *txt0, unsigned int i0,
			
 
				+	struct text *txt1, unsigned int i1,
			
 
				+	unsigned int size
			
 
				+) {
			
 
				+	/*	Adds the run of given size to our collection.
			
 
				+	*/
			
 
				+	register struct run *r = (struct run *)malloc(sizeof (struct run));
			
 
				+
			
 
				+	if (!r) fatal("out of memory");
			
 
				+	set_chunk(&r->rn_cn0, txt0, i0 - txt0->tx_start, size);
			
 
				+	set_chunk(&r->rn_cn1, txt1, i1 - txt1->tx_start, size);
			
 
				+	r->rn_size = size;
			
 
				+
			
 
				+	if (option_set('p') ? add_to_percentages(r) : add_to_runs(r)) {
			
 
				+		/* OK */
			
 
				+	}
			
 
				+	else	fatal("out of memory");
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+set_chunk(struct chunk *cnk, struct text *txt,
			
 
				+	  unsigned int start, unsigned int size
			
 
				+) {
			
 
				+	/*	Fill the chunk *cnk with info about the piece of text
			
 
				+		in txt starting at start extending over size tokens.
			
 
				+	*/
			
 
				+	cnk->ch_text = txt;
			
 
				+	set_pos(&cnk->ch_first, 0, txt, start);
			
 
				+	set_pos(&cnk->ch_last, 1, txt, start + size - 1);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+set_pos(struct position *pos, int type, struct text *txt, unsigned int start) {
			
 
				+	/* Fill a single struct position */
			
 
				+	pos->ps_next = txt->tx_pos;
			
 
				+	txt->tx_pos = pos;
			
 
				+
			
 
				+	pos->ps_type = type;
			
 
				+	pos->ps_tk_cnt = start;
			
 
				+	pos->ps_nl_cnt = -1;		/* uninitialized */
			
 
				+}
			
--- a/utils/sim_pasc/add_run.h
+++ b/utils/sim_pasc/add_run.h
@@ -0,0 +1,19 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: add_run.h,v 1.1 2001/09/28 09:03:39 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Interface between front-end and back-end: all information about
			
 
				+	runs passes through add_run().  Its parameters are the two chunks,
			
 
				+	each identified by their struct text and the position of the common
			
 
				+	segment in TokenArray[], and the number of tokens in the common
			
 
				+	segment.
			
 
				+*/
			
 
				+
			
 
				/*	Registers one common segment of size tokens; each of the two
					chunks is given by its struct text and its position in
					TokenArray[].
				*/
				void add_run(
					struct text *txt0,		/* text holding the first chunk */
					unsigned int i0,		/* position of that chunk in TokenArray[] */
					struct text *txt1,		/* text holding the second chunk */
					unsigned int i1,		/* position of that chunk in TokenArray[] */
					unsigned int size		/* length of the chunk in tokens */
				);
			
--- a/utils/sim_pasc/aiso.bdy
+++ b/utils/sim_pasc/aiso.bdy
@@ -0,0 +1,186 @@
 
				+/*
			
 
				+	Module:	Arbitrary-In Sorted-Out (AISO)
			
 
				+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
			
 
				+
			
 
				+Description:
			
 
				+	This is the body of a module that builds an arbitrary-in
			
 
				+	sorted-out data structure, to be used as a heap, a priority queue, etc.
			
 
				+	See aiso.spc for further info.
			
 
				+*/
			
 
				+
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				static struct aiso_node *root;		/* top of the binary tree */
				#ifdef	AISO_ITERATOR
				static struct aiso_node *list;		/* head of the linked list used by the iterator */
				#endif	/* AISO_ITERATOR */

				/*	The rotation policy.  access_mark is doubled on every counted
					node access and set back to 1 after a rotation; a rotation is
					due as soon as access_mark exceeds the number of entries.
				*/
				static int aiso_size = 0;		/* number of entries in the store */
				static int access_mark = 1;		/* doubles per counted access */

				/* bookkeeping of the number of entries */
				#define	add_entry()	(aiso_size++)
				#define	remove_entry()	(aiso_size--)

				/* bookkeeping of the accesses since the last rotation */
				#define	reset_access()	(access_mark = 1)
				#define	count_access()	(access_mark <<= 1)
				#define	must_rotate()	(access_mark > aiso_size)
			
 
				+
			
 
				+int
			
 
				+InsertAiso(AISO_TYPE v) {
			
 
				+	register struct aiso_node *new_node;
			
 
				+	register struct aiso_node **hook = &root;
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+	register struct aiso_node **prev = &list;
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+
			
 
				+	new_node = (struct aiso_node *)malloc(sizeof (struct aiso_node));
			
 
				+	if (!new_node) {
			
 
				+		/* avoid modifying the tree */
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	while (*hook) {
			
 
				+		register struct aiso_node *an = *hook;
			
 
				+
			
 
				+		count_access();
			
 
				+		if (AISO_BEFORE(v, an->an_value)) {
			
 
				+			/* head left */
			
 
				+			if (!an->an_left || !must_rotate()) {
			
 
				+				/* standard action */
			
 
				+				hook = &an->an_left;
			
 
				+			}
			
 
				+			else {
			
 
				+				/* change (l A r) B (C) into (l) A (r B C) */
			
 
				+				register struct aiso_node *anl = an->an_left;
			
 
				+
			
 
				+				an->an_left = anl->an_right;
			
 
				+				anl->an_right = an;
			
 
				+				*hook = anl;
			
 
				+				reset_access();
			
 
				+			}
			
 
				+		}
			
 
				+		else {
			
 
				+			/* head right */
			
 
				+			if (!an->an_right || !must_rotate()) {
			
 
				+				/* standard action */
			
 
				+				hook = &an->an_right;
			
 
				+			}
			
 
				+			else {
			
 
				+				/* change (A) B (l C r) into (A B l) C (r) */
			
 
				+				register struct aiso_node *anr = an->an_right;
			
 
				+
			
 
				+				an->an_right = anr->an_left;
			
 
				+				anr->an_left = an;
			
 
				+				*hook = anr;
			
 
				+				reset_access();
			
 
				+			}
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+			prev = &an->an_next;
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	new_node->an_left = 0;
			
 
				+	new_node->an_right = 0;
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+	new_node->an_next = *prev;
			
 
				+	*prev = new_node;
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+	new_node->an_value = v;
			
 
				+	*hook = new_node;
			
 
				+	add_entry();
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+#ifdef	AISO_EXTRACTOR
			
 
				+
			
 
				+int
			
 
				+ExtractAiso(AISO_TYPE *vp) {
			
 
				+	register struct aiso_node **hook = &root;
			
 
				+	register struct aiso_node *an;
			
 
				+
			
 
				+	if (!root) return 0;
			
 
				+
			
 
				+	while ((an = *hook), an->an_left) {
			
 
				+		/* head left */
			
 
				+		count_access();
			
 
				+		if (!must_rotate()) {
			
 
				+			/* standard action */
			
 
				+			hook = &an->an_left;
			
 
				+		}
			
 
				+		else {
			
 
				+			/* change (l A r) B (C) into (l) A (r B C) */
			
 
				+			register struct aiso_node *anl = an->an_left;
			
 
				+
			
 
				+			an->an_left = anl->an_right;
			
 
				+			anl->an_right = an;
			
 
				+			*hook = anl;
			
 
				+			reset_access();
			
 
				+		}
			
 
				+	}
			
 
				+	/* found the first */
			
 
				+	*vp = an->an_value;
			
 
				+	*hook = an->an_right;
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+	list = an->an_next;
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+	free((char *)an);
			
 
				+	remove_entry();
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+#endif	/* AISO_EXTRACTOR */
			
 
				+
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+
			
 
				+void
			
 
				+OpenIter(AisoIter *ip) {
			
 
				+	*ip = list;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+GetAisoItem(AisoIter *ip, AISO_TYPE *vp) {
			
 
				+	register struct aiso_node *an = *ip;
			
 
				+
			
 
				+	if (!an) return 0;
			
 
				+
			
 
				+	*vp = an->an_value;
			
 
				+	*ip = an->an_next;
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+CloseIter(AisoIter *ip) {
			
 
				+	*ip = 0;
			
 
				+}
			
 
				+
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+
			
 
				+#ifdef	AISO_DEBUG
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+
			
 
				+static void
			
 
				+print_inf(int level, char ch, struct aiso_node *an) {
			
 
				+	register int i;
			
 
				+
			
 
				+	if (!an) return;
			
 
				+
			
 
				+	print_inf(level+1, '/', an->an_right);
			
 
				+	for (i = 0; i < level; i++) {
			
 
				+		printf("     ");
			
 
				+	}
			
 
				+	printf("%c", ch);
			
 
				+	printf(AISO_FORMAT, an->an_value);
			
 
				+	printf("\n");
			
 
				+	print_inf(level+1, '\\', an->an_left);
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+PrintAisoTree(void)
			
 
				+{
			
 
				+	print_inf(0, '-', root);
			
 
				+	printf("================\n");
			
 
				+}
			
 
				+
			
 
				+#endif	/* AISO_DEBUG */
			
--- a/utils/sim_pasc/aiso.spc
+++ b/utils/sim_pasc/aiso.spc
@@ -0,0 +1,102 @@
 
				+/*
			
 
				+	Module:	Arbitrary-In Sorted-Out (AISO)
			
 
				+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
			
 
				+	Version:	Tue Aug 23 12:54:22 1988
			
 
				+
			
 
				+Description:
			
 
				+	This is the specification of a generic module that builds an
			
 
				+	arbitrary-in sorted-out data structure, to be used as a heap, a
			
 
				+	priority queue, etc. Elements can be inserted, the first element
			
 
				+	extracted and the set scanned at any moment.
			
 
				+
			
 
				+Instantiation:
			
 
				+	The module is instantiated as follows.
			
 
				+	Create a file M.h for some M, which contains at least:
			
 
				+	-	a definition of AISO_TYPE, the type of the object to be stored
			
 
				+	-	a possible definition of AISO_EXTRACTOR; see below
			
 
				+	-	a possible definition of AISO_ITERATOR; see below
			
 
				+	-	#include	"aiso.spc"
			
 
				+
			
 
				+	This file M.h is to be included in all files that use the aiso
			
 
				+	package.
			
 
				+
			
 
				+	Create a file M.c which contains at least:
			
 
				+	-	#include	"M.h"
			
 
				+	-	a definition of a routine
			
 
				+			int AISO_BEFORE(AISO_TYPE v, AISO_TYPE w)
			
 
				+		which yields non-zero if v is to be sorted before w
			
 
				+	-	#include	"aiso.bdy"
			
 
				+
			
 
				+	This file compiles into the module object.
			
 
				+
			
 
				+Specification:
			
 
				+	The module always supplies:
			
 
				+	int InsertAiso(AISO_TYPE value)
			
 
				+		inserts value in its proper place; fails if out of memory
			
 
				+
			
 
				+	If AISO_EXTRACTOR is defined, the module will also supply:
			
 
				+	int ExtractAiso(AISO_TYPE *value)
			
 
				+		yields the first value in the aiso and removes it;
			
 
				+		fails if empty
			
 
				+
			
 
				+	If AISO_ITERATOR is defined, the module also supplies a type AisoIter
			
 
				+	which declares an iterator, i.e., a structure that records a position
			
 
				+	in the ordered set, plus routines for manipulating the iterator, thus
			
 
				+	enabling the user to scan the ordered set.  The iterator should be
			
 
				+	declared as:
			
 
				+		AisoIter iter;
			
 
				+	and is manipulated by the following commands:
			
 
				+
			
 
				+	void OpenIter(AisoIter *iter)
			
 
				+		opens the iterator for scanning the existing set in order
			
 
				+
			
 
				+	int GetAisoItem(AisoIter *iter, AISO_TYPE *value)
			
 
				+		yields the next value in the iterator; fails if exhausted
			
 
				+
			
 
				+	void CloseIter(AisoIter *iter)
			
 
				+		closes the iterator
			
 
				+
			
 
				+	If AISO_DEBUG is defined the module will also supply:
			
 
				+	void PrintAisoTree(void)
			
 
				+		prints the AISO tree; requires AISO_FORMAT, to be set to
			
 
				+		a format suitable to print a value of type AISO_TYPE
			
 
				+
			
 
				+Implementation:
			
 
				+	The AISO implementation is based on a self-adjusting binary tree.
			
 
				+	Degenerate behaviour of the tree is avoided by shaking the tree
			
 
				+	every 'ln aiso_size' node accesses.  This guarantees ln aiso_size
			
 
				+	behaviour in the long run, though it is possible for a single
			
 
				+	operation to take aiso_size node accesses.
			
 
				+
			
 
				+	The iterator is implemented as an additional linear linked list
			
 
				+	through the tree.  This is simpler than and at least as efficient as
			
 
				+	clever tree-wiring.
			
 
				+
			
 
				+Restrictions:
			
 
				+	Due to built-in fixed names, there can only be one AISO per program.
			
 
				+*/
			
 
				+
			
 
				+struct aiso_node {
			
 
				+	struct aiso_node *an_left;
			
 
				+	struct aiso_node *an_right;
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+	struct aiso_node *an_next;
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+	AISO_TYPE an_value;
			
 
				+};
			
 
				+
			
 
				+extern int InsertAiso(AISO_TYPE value);
			
 
				+#ifdef	AISO_EXTRACTOR
			
 
				+extern int ExtractAiso(AISO_TYPE *value);
			
 
				+#endif	/* AISO_EXTRACTOR */
			
 
				+
			
 
				+#ifdef	AISO_ITERATOR
			
 
				+typedef	struct aiso_node *AisoIter;
			
 
				+extern void OpenIter(AisoIter *iter);
			
 
				+extern int GetAisoItem(AisoIter *iter, AISO_TYPE *value);
			
 
				+extern void CloseIter(AisoIter *iter);
			
 
				+#endif	/* AISO_ITERATOR */
			
 
				+
			
 
				+#ifdef	AISO_DEBUG
			
 
				+extern void PrintAisoTree(void);
			
 
				+#endif	/* AISO_ITERATOR */
			
--- a/utils/sim_pasc/algollike.c
+++ b/utils/sim_pasc/algollike.c
@@ -0,0 +1,135 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: algollike.c,v 2.4 2005/02/20 17:02:59 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	This module implements the routines InitLanguage, MayBeStartOfRun
			
 
				+	and CheckRun for ALGOL-like languages, in which it is meaningful
			
 
				+	and useful to isolate function bodies.
			
 
				+
			
 
				+	It requires the user to define, preferably in Xlang.l, four token
			
 
				+	sets, represented as TOKEN[] and terminated by NOTOKEN:
			
 
				+
			
 
				+	TOKEN NonFinals[]	tokens that may not end a chunk
			
 
				+	TOKEN NonInitials[]	tokens that may not start a chunk
			
 
				+	TOKEN Openers[]		openers of parentheses that must balance
			
 
				+					in functions
			
 
				+	TOKEN Closers[]		the corresponding closers, in the same order
			
 
				+*/
			
 
				+
			
 
				+#include	"options.h"
			
 
				+#include	"token.h"
			
 
				+#include	"algollike.h"
			
 
				+
			
 
				+/*	Arrays for fast identification tests for tokens.  Each token is
			
 
				+	identified by its position in the set + 1.  For example, if T is
			
 
				+	the n-th Opener, openers[TOKEN2int(tk)] == n+1.
			
 
				+*/
			
 
				+static char non_finals[256];
			
 
				+static char non_initials[256];
			
 
				+static char openers[256];
			
 
				+static char closers[256];
			
 
				+
			
 
				+static void cvt2bittable(const TOKEN *tl, char bt[256]);
			
 
				+static unsigned int largest_function(const TOKEN *str, unsigned int size);
			
 
				+
			
 
				+void
			
 
				+InitLanguage(void) {
			
 
				+	/* convert the token sets to bitmaps */
			
 
				+	cvt2bittable(NonFinals, non_finals);
			
 
				+	cvt2bittable(NonInitials, non_initials);
			
 
				+	cvt2bittable(Openers, openers);
			
 
				+	cvt2bittable(Closers, closers);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+cvt2bittable(const TOKEN *tl, char bt[256]) {
			
 
				+	int i;
			
 
				+	int cnt = 1;
			
 
				+
			
 
				+	for (i = 0; !TOKEN_EQ(tl[i], NOTOKEN); i++) {
			
 
				+		bt[TOKEN2int(tl[i])] = cnt++;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+MayBeStartOfRun(TOKEN tk) {
			
 
				+	return !non_initials[TOKEN2int(tk)];
			
 
				+}
			
 
				+
			
 
				+unsigned int
			
 
				+CheckRun(const TOKEN *str, unsigned int size) {
			
 
				+	/*	Checks the run starting at str with length size for
			
 
				+		acceptability in the language.  Cuts from the end if
			
 
				+		necessary and returns the accepted length, which may
			
 
				+		be zero.
			
 
				+	*/
			
 
				+
			
 
				+	if (option_set('f')) {
			
 
				+		/* reduce to a function-like form first */
			
 
				+		size = largest_function(str, size);
			
 
				+	}
			
 
				+
			
 
				+	while (	/* there is trailing garbage */
			
 
				+		size != 0 && non_finals[TOKEN2int(str[size-1])]
			
 
				+	) {
			
 
				+		/* remove it */
			
 
				+		size--;
			
 
				+	}
			
 
				+
			
 
				+	return size;
			
 
				+}
			
 
				+
			
 
				+static unsigned int
			
 
				+largest_function(const TOKEN *str, unsigned int size) {
			
 
				+	/*	Returns the size of the longest sequence starting at
			
 
				+		str[0] and not containing unbalanced parentheses.
			
 
				+		Does not check the nesting of the parentheses, but then,
			
 
				+		sim is syntax-free anyway.
			
 
				+	*/
			
 
				+	register unsigned int mrb_size = 0;  /* most recent balancing size */
			
 
				+	register unsigned int pos;
			
 
				+	register int i;
			
 
				+	int balance_count[256];
			
 
				+	int n_imbalances;
			
 
				+
			
 
				+	/* clear administration */
			
 
				+	n_imbalances = 0;
			
 
				+	for (i = 0; i < 255; i++) {
			
 
				+		balance_count[i] = 0;
			
 
				+	}
			
 
				+
			
 
				+	/* scan str[] and see how far we get */
			
 
				+	for (pos = 0; pos < size; pos++) {
			
 
				+		register int tkval = TOKEN2int(str[pos]);
			
 
				+		register int pp;		/* parenthesis position */
			
 
				+
			
 
				+		/* account for openers */
			
 
				+		if ((pp = openers[tkval])) {
			
 
				+			if (balance_count[pp] == 0) {
			
 
				+				/* about to create an imbalance */
			
 
				+				n_imbalances++;
			
 
				+			}
			
 
				+			balance_count[pp]++;
			
 
				+		}
			
 
				+
			
 
				+		/* account for closers */
			
 
				+		if ((pp = closers[tkval])) {
			
 
				+			if (balance_count[pp] == 0) {
			
 
				+				/* this is one Closer too many */
			
 
				+				return mrb_size;
			
 
				+			}
			
 
				+			balance_count[pp]--;
			
 
				+			if (balance_count[pp] == 0) {
			
 
				+				/* we just cleared an imbalance */
			
 
				+				n_imbalances--;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if (n_imbalances == 0) {
			
 
				+			/* register balance point */
			
 
				+			mrb_size = pos + 1;
			
 
				+		}
			
 
				+	}
			
 
				+	return mrb_size;
			
 
				+}
			
--- a/utils/sim_pasc/algollike.h
+++ b/utils/sim_pasc/algollike.h
@@ -0,0 +1,27 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: algollike.h,v 1.1 1997/06/20 12:03:11 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	The class Algollike is a subclass of Language.  It implements
			
 
				+	the routines InitLanguage, MayBeStartOfRun and CheckRun for
			
 
				+	ALGOL-like languages, in which it is meaningful and useful to
			
 
				+	isolate function bodies.
			
 
				+
			
 
				+	It requires the user to define, preferably in Xlang.l, four token
			
 
				+	sets, represented as TOKEN[] and terminated by NOTOKEN:
			
 
				+
			
 
				+	TOKEN NonFinals[]	tokens that may not end a chunk
			
 
				+	TOKEN NonInitials[]	tokens that may not start a chunk
			
 
				+	TOKEN Openers[]		openers of parentheses that must balance
			
 
				+					in functions
			
 
				+	TOKEN Closers[]		the corresponding closers, in the same order
			
 
				+*/
			
 
				+
			
 
				+#include	"language.h"
			
 
				+#include	"token.h"
			
 
				+
			
 
				+extern const TOKEN NonFinals[];
			
 
				+extern const TOKEN NonInitials[];
			
 
				+extern const TOKEN Openers[];
			
 
				+extern const TOKEN Closers[];
			
--- a/utils/sim_pasc/clang.l
+++ b/utils/sim_pasc/clang.l
@@ -0,0 +1,252 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: clang.l,v 2.9 2007/08/29 09:10:31 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	C language front end for the similarity tester.
			
 
				+	Author:	Dick Grune <[email protected]>
			
 
				+*/
			
 
				+
			
 
				+#include	"options.h"
			
 
				+#include	"algollike.h"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+
			
 
				+/* Data for module idf */
			
 
				+
			
 
				+/*	Preprocessor command names and the tokens they translate to.
				+	The table is searched with idf_in_list(), which does a binary
				+	search using strcmp(); the entries must therefore stay in
				+	strcmp() order.
				+*/
				+static const struct idf ppcmd[] = {
				+	{"define",	META('d')},
				+	{"else",	META('e')},
				+	{"endif",	META('E')},
				+	{"if",		META('i')},
				+	{"ifdef",	META('I')},
				+	{"ifndef",	META('x')},
				+	{"include",	MTCT('I')},
				+	{"line",	META('l')},
				+	{"undef",	META('u')}
				+};
			
 
				+
			
 
				+/*	Reserved words of C and the tokens they translate to.
				+	The table is searched with idf_in_list(), which does a binary
				+	search using strcmp(); the entries must therefore stay in
				+	strcmp() order.
				+	Words mapped to SKIP ("register", "void") produce no token at
				+	all: the identifier rules below do not return a SKIP token.
				+*/
				+static const struct idf reserved[] = {
				+	{"auto",	NORM('a')},
				+	{"break",	NORM('b')},
				+	{"case",	NORM('c')},
				+	{"char",	NORM('C')},
				+	{"continue",	CTRL('C')},
				+	{"default",	NORM('d')},
				+	{"do",		NORM('D')},
				+	{"double",	CTRL('D')},
				+	{"else",	NORM('e')},
				+	{"enum",	NORM('E')},
				+	{"extern",	CTRL('E')},
				+	{"float",	NORM('f')},
				+	{"for",		NORM('F')},
				+	{"goto",	NORM('g')},
				+	{"if",		NORM('i')},
				+	{"int",		NORM('I')},
				+	{"long",	NORM('l')},
				+	{"register",	SKIP},
				+	{"return",	NORM('r')},
				+	{"short",	NORM('s')},
				+	{"sizeof",	NORM('S')},
				+	{"static",	CTRL('S')},
				+	{"struct",	META('s')},
				+	{"switch",	META('S')},
				+	{"typedef",	NORM('t')},
				+	{"union",	NORM('u')},
				+	{"unsigned",	NORM('U')},
				+	{"void",	SKIP},
				+	{"while",	NORM('w')}
				+};
			
 
				+
			
 
				+/* Special treatment of identifiers */
			
 
				+
			
 
				+static TOKEN
				+idf2token(int hashing) {
				+	/*	Translates the identifier in yytext to a token.
				+		A reserved word yields its token from reserved[]; any
				+		other identifier yields IDF, or, when hashing is non-zero,
				+		the one-token hash code supplied by idf_hashed().
				+	*/
				+	const TOKEN found = idf_in_list(yytext, reserved, sizeof reserved, IDF);
				+
				+	if (!hashing || !TOKEN_EQ(found, IDF)) {
				+		/* a reserved word, or no hashing wanted */
				+		return found;
				+	}
				+	return idf_hashed(yytext);
				+}
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+/* Tokens that may not end a chunk; the list is terminated by NOTOKEN. */
				+const TOKEN NonFinals[] = {
				+	IDF,		/* identifier */
				+	NORM('{'),
				+	NORM('('),
				+	NORM('a'),	/* auto */
				+	NORM('b'),	/* break */
				+	NORM('c'),	/* case */
				+	NORM('C'),	/* char */
				+	CTRL('C'),	/* continue */
				+	NORM('d'),	/* default */
				+	NORM('D'),	/* do */
				+	CTRL('D'),	/* double */
				+	NORM('E'),	/* enum */
				+	CTRL('E'),	/* extern */
				+	NORM('f'),	/* float */
				+	NORM('F'),	/* for */
				+	NORM('g'),	/* goto */
				+	NORM('i'),	/* if */
				+	NORM('I'),	/* int */
				+	NORM('l'),	/* long */
				+	NORM('r'),	/* return */
				+	NORM('s'),	/* short */
				+	CTRL('S'),	/* static */
				+	META('s'),	/* struct */
				+	META('S'),	/* switch */
				+	NORM('t'),	/* typedef */
				+	NORM('u'),	/* union */
				+	NORM('U'),	/* unsigned */
				+	NORM('w'),	/* while */
				+	NOTOKEN
				+};
			
 
				+/* Tokens that may not start a chunk; the list is terminated by NOTOKEN. */
				+const TOKEN NonInitials[] = {
				+	NORM(')'),
				+	NORM('}'),
				+	NORM(';'),
				+	NOTOKEN
				+};
			
 
				+/*	Openers of parentheses that must balance in functions; the list is
				+	terminated by NOTOKEN.  Entry n pairs with entry n of Closers[].
				+*/
				+const TOKEN Openers[] = {
				+	NORM('{'),
				+	NORM('('),
				+	NORM('['),
				+	NOTOKEN
				+};
			
 
				+/*	The closers corresponding to Openers[], in the same order; the list
				+	is terminated by NOTOKEN.
				+*/
				+const TOKEN Closers[] = {
				+	NORM('}'),
				+	NORM(')'),
				+	NORM(']'),
				+	NOTOKEN
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+AnyQuoted	(\\.)
			
 
				+StrChar		([^"\n\\]|{AnyQuoted})
			
 
				+ChrChar		([^'\n\\]|{AnyQuoted})
			
 
				+
			
 
				+StartComment	("/*")
			
 
				+EndComment	("*/")
			
 
				+SafeComChar	([^*\n])
			
 
				+UnsafeComChar	("*")
			
 
				+
			
 
				+Digit		([0-9a-fA-F])
			
 
				+Idf		([A-Za-z][A-Za-z0-9_]*)
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+{StartComment}	{
			
 
				+		/*	We do not have one single pattern to match a comment
			
 
				+			(although one can be written), for two reasons.
			
 
				+			The matched string might overflow lex-internal buffers
			
 
				+			like yysbuf and yytext; and the pattern would be very
			
 
				+			complicated and overtax lex.
			
 
				+			So we break up the string into safe chunks and keep
			
 
				+			track of where we are in a start condition <Comment>.
			
 
				+		*/
			
 
				+		BEGIN Comment;
			
 
				+	}
			
 
				+
			
 
				+<Comment>{SafeComChar}+	{		/* safe comment chunk */
			
 
				+	}
			
 
				+
			
 
				+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
			
 
				+	}
			
 
				+
			
 
				+<Comment>"\n"		{		/* to break up long comments */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+<Comment>{EndComment}	{		/* end-of-comment */
			
 
				+		BEGIN INITIAL;
			
 
				+	}
			
 
				+
			
 
				+\"{StrChar}*\"	{			/* strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+\'{ChrChar}+\'	{			/* characters */
			
 
				+		return_ch('\'');
			
 
				+	}
			
 
				+
			
 
				+^#{Layout}*include.*	{		/* ignore #include lines */
			
 
				+	}
			
 
				+
			
 
				+^#{Layout}*{Idf}	{		/* a preprocessor line */
				+		/*	yytext holds '#', optional layout and the command
				+			name.  The name is looked up in ppcmd[]; an unknown
				+			command yields the plain '#' token.
				+		*/
				+		register char *idf = yytext+1;
				+
				+		/*	skip layout in front of preprocessor identifier;
				+			this must cover every character in {Layout}, or
				+			the lookup below starts on a layout character
				+		*/
				+		while (	*idf == ' ' || *idf == '\t'
				+		||	*idf == '\r' || *idf == '\f'
				+		) {
				+			idf++;
				+		}
				+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
				+	}
			
 
				+
			
 
				+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
			
 
				+		return_tk(IDF);
			
 
				+	}
			
 
				+
			
 
				+{Idf}/"("	{			/* identifier in front of ( */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(option_set('F'));
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(0 /* no hashing */);
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+\;	{				/* semicolon, conditionally ignored */
			
 
				+		if (option_set('f')) return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+/* Puts the scanner in the INITIAL start condition. */
				+void
				+yystart(void) {
				+	BEGIN INITIAL;
				+}
			
 
				+
			
 
				+/*	End-of-input hook of the lex-generated scanner; always returns 1,
				+	which tells the scanner that no further input follows.
				+*/
				+int
				+yywrap(void) {
				+	return 1;
				+}
			
--- a/utils/sim_pasc/compare.c
+++ b/utils/sim_pasc/compare.c
@@ -0,0 +1,198 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: compare.c,v 2.5 2001/09/28 09:03:47 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	"sim.h"
			
 
				+#include	"tokenarray.h"
			
 
				+#include	"hash.h"
			
 
				+#include	"language.h"
			
 
				+#include	"options.h"
			
 
				+#include	"add_run.h"
			
 
				+#include	"compare.h"
			
 
				+
			
 
				+static void compare1text(int, int, int);
			
 
				+static unsigned int lcs(
			
 
				+	struct text *, unsigned int, struct text **, unsigned int *,
			
 
				+	unsigned int, unsigned int
			
 
				+);
			
 
				+
			
 
				+/*	The overall structure of the routine Compare() is:
			
 
				+
			
 
				+	for all new files
			
 
				+		for all texts it must be compared to
			
 
				+			for all positions in the new file
			
 
				+				for all positions in the text
			
 
				+					for ever increasing sizes
			
 
				+						try to match and keep the best
			
 
				+*/
			
 
				+
			
 
				+void
				+Compare(void) {
				+	/*	Compares each of the first NumberOfNewTexts texts to a
				+		range of texts ending at NumberOfTexts.  The range starts
				+		at text NumberOfNewTexts + 1 under option 'S', at the
				+		text following the new text under option 's', and at the
				+		new text itself otherwise.
				+		Under option 'e' the texts in the range are compared one
				+		at a time; otherwise the whole range is compared in a
				+		single call of compare1text().
				+	*/
				+	int new_text;
				+
				+	for (new_text = 0; new_text < NumberOfNewTexts; new_text++) {
				+		int first;
				+		int other;
				+
				+		/* where does the range of texts to compare to start? */
				+		if (option_set('S')) {
				+			first = NumberOfNewTexts + 1;
				+		}
				+		else if (option_set('s')) {
				+			first = new_text + 1;
				+		}
				+		else {
				+			first = new_text;
				+		}
				+
				+		if (!option_set('e')) {
				+			/* the whole range in one action, if not empty */
				+			if (first < NumberOfTexts) {
				+				compare1text(new_text, first, NumberOfTexts);
				+			}
				+			continue;
				+		}
				+
				+		/* the range in steps of one text */
				+		for (other = first; other < NumberOfTexts; other++) {
				+			compare1text(new_text, other, other + 1);
				+		}
				+	}
				+}
			
 
				+
			
 
				+static void
				+compare1text(
				+	int n,				/* text to be compared */
				+	int first,			/* first text to be compared to */
				+	int limit			/* limit text in comparison */
				+) {
				+	/*	Walks through text n and asks lcs() at each position for
				+		the best run in the token range of texts first..limit-1.
				+		A run that is found is entered through add_run() and then
				+		skipped; otherwise the walk advances by one token.
				+	*/
				+	struct text *subject = &Text[n];
				+	const unsigned int range_start = Text[first].tx_start;
				+	const unsigned int range_limit = Text[limit-1].tx_limit;
				+	unsigned int pos = subject->tx_start;
				+
				+	/* as long as a run of MinRunSize tokens still fits in the text */
				+	while (pos + MinRunSize - 1 < subject->tx_limit) {
				+		struct text *best_text;
				+		unsigned int best_pos;
				+		const unsigned int best_size =
				+			lcs(subject, pos, &best_text, &best_pos,
				+				range_start, range_limit);
				+
				+		if (best_size == 0) {
				+			/* nothing here; try the next token */
				+			pos++;
				+			continue;
				+		}
				+
				+		/* enter the run and step over it */
				+		add_run(subject, pos, best_text, best_pos, best_size);
				+		pos += best_size;
				+	}
				+}
			
 
				+
			
 
				+static unsigned int
				+lcs(	struct text *txt0,		/* input: starting position */
				+	unsigned int i0,
				+	struct text **tbp,		/* output: position of best run */
				+	unsigned int *ibp,
				+	unsigned int i_first,		/* no comparison before this pos. */
				+	unsigned int i_limit		/* no comparison after this pos. */
				+) {
				+	/*	Finds the longest common substring (not -sequence) in:
				+			txt0, starting precisely at i0 and
				+			the text between i_first and i_limit.
				+		Writes the position in tbp and ibp and returns the size.
				+		Returns 0 if no common substring is found.
				+
				+		The candidate positions are those on the forward
				+		reference chain that starts at i0.  *tbp and *ibp are
				+		written only when a run is recorded, so they are left
				+		untouched when 0 is returned.
				+	*/
				+	register struct text *txt1 = txt0;
				+	register unsigned int i1 = i0;
				+	register unsigned int size_best = 0;
				+	register unsigned int txt0limit = txt0->tx_limit;
				+	register unsigned int txt1limit = txt1->tx_limit;
				+
				+	while (	/* there is a next opportunity */
				+		(i1 = ForwardReference(i1))
				+	&&	/* it is still in range */
				+		i1 < i_limit
				+	) {
				+		register unsigned int min_size;
				+		register unsigned int new_size;
				+		register unsigned int j0;
				+		register unsigned int j1;
				+
				+		if (i1 < i_first) {	/* not in range */
				+			continue;
				+		}
				+
				+		/* bump txt1; we may have skipped a text or two */
				+		while (i1 >= txt1->tx_limit) {
				+			txt1++;
				+		}
				+		txt1limit = txt1->tx_limit;
				+
				+		/* a candidate is only of interest if it beats the best so far */
				+		min_size = (size_best ? size_best+1 : MinRunSize);
				+		/* are we looking at something better than we have got? */
				+		{
				+			/* j0 and j1 start on the last token of the min_size window */
				+			j0 = i0 + min_size - 1;
				+			j1 = i1 + min_size - 1;
				+			if (	/* j0 still inside txt0 */
				+				j0 < txt0limit
				+			&&	/* j1 still inside txt1 */
				+				j1 < txt1limit
				+			&&	/* j0 and j1 don't overlap */
				+				j0 < j1 - min_size + 1
				+			) {
				+				/* there would be room enough */
				+				register int cnt = min_size;
				+
				+				/* does the text match? (compared from the end backwards) */
				+				while (	cnt
				+				&&	TOKEN_EQ(TokenArray[j0], TokenArray[j1])
				+				) {
				+					cnt--, j0--, j1--;
				+				}
				+				if (cnt) continue;	/* forget it */
				+			}
				+			else continue;			/* forget it */
				+		}
				+
				+		/* yes, we are; how long can we make it? */
				+		{
				+			register unsigned int size = min_size;
				+
				+			/* j0 and j1 now run just past the matched part */
				+			j0 = i0 + min_size;
				+			j1 = i1 + min_size;
				+			while (	/* j0 still inside txt0 */
				+				j0 < txt0limit
				+			&&	/* j1 still inside txt1 */
				+				j1 < txt1limit
				+			&&	/* j0 and j1 don't overlap */
				+				/* j1 - i1 == size here, so this is j0 < i1 */
				+				j0 + size < j1
				+			&&	/* tokens are the same */
				+				TOKEN_EQ(TokenArray[j0], TokenArray[j1])
				+			) {
				+				j0++, j1++, size++;
				+			}
				+			new_size = size;
				+		}
				+
				+		/*	offer the run to the Language Department which may
				+			reject it or may cut its tail
				+		*/
				+		new_size = (	MayBeStartOfRun(TokenArray[i0])
				+			   ?	CheckRun(&TokenArray[i0], new_size)
				+			   :	0
				+			   );
				+
				+		if (	/* we still have something acceptable */
				+			new_size >= MinRunSize
				+		&&	/* it is better still than what we had */
				+			new_size > size_best
				+		) {
				+			/* record it */
				+			*tbp = txt1;
				+			*ibp = i1;
				+			size_best = new_size;
				+		}
				+	}
				+
				+	return size_best;
				+}
			
--- a/utils/sim_pasc/compare.h
+++ b/utils/sim_pasc/compare.h
@@ -0,0 +1,11 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: compare.h,v 1.2 1998/01/21 14:27:47 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Compares each new text to the appropriate texts.
			
 
				+	Stores the runs found in the AISO heap.
			
 
				+	Runs contain references to positions in the input files.
			
 
				+*/
			
 
				+
			
 
				+extern void Compare(void);
			
--- a/utils/sim_pasc/debug.par
+++ b/utils/sim_pasc/debug.par
@@ -0,0 +1,20 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: debug.par,v 1.3 1998/02/03 14:28:21 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#undef	DB_FORW_REF			/* print & check forward references */
			
 
				+#undef	DB_TEXT				/* print all text parts */
			
 
				+#undef	DB_POS				/* print positions in files */
			
 
				+#undef	DB_NL_BUFF			/* print the newline count buffer */
			
 
				+#undef	DB_RUN				/* print all identified runs */
			
 
				+
			
 
				+#ifdef	lint
			
 
				+
			
 
				+#define	DB_FORW_REF
			
 
				+#define	DB_TEXT
			
 
				+#define	DB_POS
			
 
				+#define	DB_NL_BUFF
			
 
				+#define	DB_RUN
			
 
				+
			
 
				+#endif	/* lint */
			
--- a/utils/sim_pasc/error.c
+++ b/utils/sim_pasc/error.c
@@ -0,0 +1,16 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: error.c,v 2.4 1998/02/03 14:28:22 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<stdlib.h>
			
 
				+
			
 
				+#include	"sim.h"
			
 
				+#include	"error.h"
			
 
				+
			
 
				+/*	Prints "<progname>: <msg>" and a newline on stderr and terminates
				+	the program with exit status 1; does not return.
				+*/
				+void
				+fatal(const char *msg) {
				+	fprintf(stderr, "%s: %s\n", progname, msg);
				+	exit(1);
				+}
			
--- a/utils/sim_pasc/error.h
+++ b/utils/sim_pasc/error.h
@@ -0,0 +1,6 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: error.h,v 1.3 1998/02/03 14:28:23 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+extern void fatal(const char *msg);
			
--- a/utils/sim_pasc/hash.c
+++ b/utils/sim_pasc/hash.c
@@ -0,0 +1,386 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Text is compared by comparing every substring to all substrings
			
 
				+	to the right of it; this process is in essence quadratic.  However,
			
 
				+	only substrings of length at least 'MinRunSize' are of interest,
			
 
				+	which gives us the possibility to speed up this process by using
			
 
				+	a hash table.
			
 
				+
			
 
				+	For every position in the text, we construct an index which gives
			
 
				+	the next position in the text at which a run of MinRunSize tokens
			
 
				+	starts that has the same hash code, as calculated by hash1().  If
			
 
				+	there is no such run, the index is 0.  These forward references are
			
 
				+	kept in the array forward_references[].
			
 
				+
			
 
				+	To construct this array, we use a hash table last_index[] whose size
			
 
				+	is a prime and which is about 8 times smaller than the text array.
			
 
				+	The hash table last_index[] is set up such that last_index[i] is the
			
 
				+	index of the latest token with hash_code i, or 0 if there is none.
			
 
				+	This results in hash chains of an average length of 8.  See
			
 
				+	MakeForwardReferences().
			
 
				+
			
 
				+	If there is not enough room for a hash table of the proper size
			
 
				+	(which can be considerable) the hashing is not efficient any more.
			
 
				+	In that case, the forward reference table is scanned a second time,
			
 
				+	eliminating from any chain all references to runs that do not hash to
			
 
				+	the same value under a second hash function, hash2().  For the UNIX
			
 
				+	manuals this reduced the number of matches from 91.9% to 1.9% (of
			
 
				+	which 0.06% was genuine).
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"system.par"
			
 
				+#include	"debug.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"error.h"
			
 
				+#include	"language.h"
			
 
				+#include	"token.h"
			
 
				+#include	"tokenarray.h"
			
 
				+#include	"options.h"
			
 
				+#include	"hash.h"
			
 
				+
			
 
				+							/* MAIN ENTRIES */
			
 
				+static unsigned int *forward_references;	/* to be filled by malloc() */
			
 
				+static int n_forward_references;
			
 
				+
			
 
				+static void make_forward_references_hash1(void);
			
 
				+static void make_forward_references_hash2(void);
			
 
				+
			
 
				+#ifdef	DB_FORW_REF
			
 
				+static void db_forward_references(const char *);
			
 
				+static void make_forward_references_hash3(void);
			
 
				+#endif
			
 
				+
			
 
				+void
				+MakeForwardReferences(void) {
				+	/*	Builds the forward references table: allocates one
				+		zero-filled entry per text position, fills it through the
				+		primary hashing and thins it out through the secondary
				+		hashing.  With DB_FORW_REF defined, a third, checking
				+		pass is made.  Calls fatal() when out of memory.
				+	*/
				+
				+	n_forward_references = TextLength();
				+	forward_references = (unsigned int *)
				+		calloc(n_forward_references, sizeof *forward_references);
				+	if (forward_references == 0) {
				+		fatal("out of memory");
				+	}
				+
				+	make_forward_references_hash1();
				+	make_forward_references_hash2();
				+#ifdef	DB_FORW_REF
				+	make_forward_references_hash3();
				+#endif
				+}
			
 
				+
			
 
				+unsigned int
				+ForwardReference(int i) {
				+	/*	Returns the forward reference stored for position i.
				+		Position 0 and positions at or beyond the table size
				+		are reported through fatal() as an internal error.
				+	*/
				+	const int in_table = (0 < i && i < n_forward_references);
				+
				+	if (!in_table) {
				+		fatal("internal error, bad forward reference");
				+	}
				+	return forward_references[i];
				+}
			
 
				+
			
 
				+void
				+FreeForwardReferences(void) {
				+	/* Releases the table allocated by MakeForwardReferences(). */
				+	free(forward_references);
				+}
			
 
				+
			
 
				+							/* HASHING */
			
 
				+/*
			
 
				+	We want a hash function whose time cost does not depend on
			
 
				+	MinRunSize, which is a problem since the size of the value
			
 
				+	we derive the hash function from IS equal to MinRunSize!
			
 
				+	Therefore we base the hash function on a sample of at most 24
			
 
				+	tokens from the input string; this works at least as well in
			
 
				+	practice.  These 24 token values will result in exactly 31
			
 
				+	bits under the hashing algorithm used, which avoids an
			
 
				+	overflow test.  So this 24 bears no relation to the default
			
 
				+	run size (although the fit is surprising!)
			
 
				+*/
			
 
				+
			
 
				+#define	N_SAMPLES	24
			
 
				+#define	OPERATION	^
			
 
				+
			
 
				+/*	An alternative algorithm; does not seem to make any difference.
			
 
				+#define	N_SAMPLES	23
			
 
				+#define	OPERATION	+
			
 
				+*/
			
 
				+
			
 
				+/*	Another algorithm; not yet tested
			
 
				+#define	N_SAMPLES	24
			
 
				+#define	OPERATION	+ 613 *
			
 
				+*/
			
 
				+
			
 
				+static unsigned int *last_index;
			
 
				+static unsigned int hash_table_size;
			
 
				+static int sample_pos[N_SAMPLES];
			
 
				+
			
 
				+/*	Candidate hash table sizes in ascending order; init_hash_table()
				+	picks hash_table_size from this list.
				+*/
				+static unsigned int
				+prime[] = {		/* lots of hopefully suitable primes */
				+	10639,
				+	21283,
				+	42571,
				+	85147,
				+	170227,
				+	340451,
				+	680959,
				+	1361803,
				+	2723599,
				+	5447171,
				+	10894379,
				+	21788719,
				+	43577399,
				+	87154759,
				+	174309383,
				+	348618827,
				+	697237511,
				+	1394475011
				+};
			
 
				+
			
 
				+static void
				+init_hash_table(void) {
				+	/*	Allocates the zero-filled hash table last_index[], sets
				+		hash_table_size and computes sample_pos[], the positions
				+		inside a run of MinRunSize tokens that hash1() samples.
				+		Calls fatal() if not even the smallest table can be
				+		allocated.
				+	*/
				+	register int n;
				+	const int n_primes = sizeof prime / sizeof prime[0];
				+
				+	/*	find the ideal hash table size: the first prime that is
				+		not smaller than the text length; if the text is longer
				+		than the largest prime, settle for the largest prime
				+		rather than running off the end of prime[]
				+	*/
				+	n = 0;
				+	while (n < n_primes - 1 && prime[n] < TextLength()) {
				+		n++;
				+	}
				+
				+	/* see if we can allocate that much space, and if not, step down */
				+	last_index = 0;
				+	while (!last_index && n >= 0) {
				+		hash_table_size = prime[n];
				+		last_index = (unsigned int *)
				+			calloc(hash_table_size, sizeof (unsigned int));
				+		n--;
				+	}
				+	if (!last_index) {
				+		fatal("out of memory");
				+	}
				+	
				+	/* find sample positions */
				+	for (n = 0; n < N_SAMPLES; n++) {
				+		/*	straight-line approximation; unintuitive as usual:
				+			n * (MinRunSize-1) / (N_SAMPLES-1), rounded to
				+			the nearest integer
				+		*/
				+		sample_pos[n] = (
				+			(2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
				+		/	(2 * (N_SAMPLES - 1))
				+		);
				+	}
				+}
			
 
				+
			
 
				+static int hash1(const TOKEN *);
			
 
				+
			
 
				+/*	First pass: chains together, through forward_references[], the
				+	positions whose MinRunSize-token run has the same hash1() code.
				+	The hash table last_index[] exists only during this pass.
				+*/
				+static void
				+make_forward_references_hash1(void) {
				+	register int n;
				+
				+	init_hash_table();
				+
				+	/* set up the forward references using the last_index hash table */
				+	for (n = 0; n < NumberOfTexts; n++) {
				+		register struct text *txt = &Text[n];
				+		register unsigned int j;
				+
				+		for (	/* all pos'ns in txt except the last MinRunSize-1 */
				+			j = txt->tx_start;			/* >= 1 */
				+			j + MinRunSize - 1 < txt->tx_limit;
				+			j++
				+		) {
				+			if (MayBeStartOfRun(TokenArray[j])) {
				+				register int h = hash1(&TokenArray[j]);
				+
				+				/* link the previous position with this code to j */
				+				if (last_index[h]) {
				+					forward_references[last_index[h]] = j;
				+				}
				+				/* j is now the latest position with this code */
				+				last_index[h] = j;
				+			}
				+		}
				+	}
				+	free((char *)last_index);
				+
				+#ifdef	DB_FORW_REF
				+	db_forward_references("first hashing");
				+#endif	/* DB_FORW_REF */
				+}
			
 
				+
			
 
				+static int
				+hash1(const TOKEN *p) {
				+	/*	hash1(p) returns the hash code of the MinRunSize
				+		tokens starting at p; caller guarantees that there
				+		are at least MinRunSize tokens.
				+
				+		Only the N_SAMPLES tokens at the offsets in sample_pos[]
				+		take part.  The result lies below hash_table_size.
				+	*/
				+	register int32 h_val;
				+	register int n;
				+	
				+	h_val = 0;
				+	for (n = 0; n < N_SAMPLES; n++) {
				+		/* shift the code one bit and combine it with the next sample */
				+		h_val = (h_val << 1) OPERATION TOKEN2int(p[sample_pos[n]]);
				+#if	N_SAMPLES > 24
				+		/* bit 31 got set: clear it and flip bit 0 instead */
				+		if (h_val & (1<<31)) {
				+			h_val ^= (1<<31|1);
				+		}
				+#endif
				+	}
				+	/* just in case somebody tries wrong N_SAMPLES and OPERATION values: */
				+	if (h_val < 0) fatal("corrupt hash algorithm in hash1() in hash.c");
				+
				+	return h_val % hash_table_size;
				+}
			
 
				+
			
 
				+static int hash2(const TOKEN *);
			
 
				+
			
 
				+/*	Second pass: makes each forward reference skip the chain members
				+	whose hash2() code differs from that of the referring position.
				+*/
				+static void
				+make_forward_references_hash2(void) {
				+	register unsigned int i;
				+
				+	/* do a second hash only if the original hash table was reduced */
				+	/*	Meanwhile, the quality of the primary hashing is so bad
				+		that we are virtually forced to always do a second scan.
				+	*/
				+	/* NOTE(review): the code below indeed runs unconditionally */
				+
				+	/*	Clean out spurious matches, by a quadratic algorithm.
				+		Note that we do not want to eliminate overlapping
				+		sequences in this stage, since we might be removing the
				+		wrong copy.
				+	*/
				+	for (i = 0; i+MinRunSize < TextLength(); i++) {
				+		register unsigned int j = i;
				+		register int h2 = hash2(&TokenArray[i]);
				+
				+		/*	Find the first token sequence in the chain
				+			with same secondary hash code.
				+		*/
				+		while (	/* there is still a forward reference */
				+			(j = forward_references[j])
				+		&&	/* its hash code does not match */
				+			hash2(&TokenArray[j]) != h2
				+		) {
				+			/* continue searching */
				+		}
				+		/* short-circuit forward reference to it, or to zero */
				+		forward_references[i] = j;
				+	}
				+
				+#ifdef	DB_FORW_REF
				+	db_forward_references("second hashing");
				+#endif	/* DB_FORW_REF */
				+}
			
 
				+
			
 
				+static int
				+hash2(const TOKEN *p) {
				+	/*	The secondary hash code of the MinRunSize tokens at p:
				+		the value of the first token, shifted left 8 bits, plus
				+		the value of the last token.
				+	*/
				+	const int first_token = TOKEN2int(p[0]);
				+	const int last_token = TOKEN2int(p[MinRunSize-1]);
				+
				+	return (first_token << 8) + last_token;
				+}
			
 
				+
			
 
				+#ifdef	DB_FORW_REF
			
 
				+
			
 
				+static int hash3(const TOKEN *, const TOKEN *);
			
 
				+
			
 
				+/*	Debugging pass (DB_FORW_REF only): makes each forward reference skip
				+	the chain members whose MinRunSize tokens are not all equal to
				+	those at the referring position, and then reports on the table.
				+*/
				+static void
				+make_forward_references_hash3(void) {
				+	register unsigned int i;
				+
				+	/* do a third hash to check up on the previous two */
				+
				+	/* this time we use a genuine compare */
				+	for (i = 0; i+MinRunSize < TextLength(); i++) {
				+		register unsigned int j = i;
				+
				+		while (	/* there is still a forward reference */
				+			(j = forward_references[j])
				+		&&	/* its hash code does not match */
				+			!hash3(&TokenArray[i], &TokenArray[j])
				+		) {
				+			/* continue searching */
				+		}
				+		/* short-circuit forward reference to it, or to zero */
				+		forward_references[i] = j;
				+	}
				+
				+	db_forward_references("third hashing");
				+}
			
 
				+
			
 
				+static int
				+hash3(const TOKEN *p, const TOKEN *q) {
				+	/*	The "hash" of the tertiary sweep is a full comparison:
				+		returns 1 if the MinRunSize tokens at p and at q have
				+		pairwise equal values, and 0 otherwise.
				+	*/
				+	int n = 0;
				+
				+	while (n < MinRunSize) {
				+		if (TOKEN2int(p[n]) != TOKEN2int(q[n])) {
				+			return 0;
				+		}
				+		n++;
				+	}
				+	return 1;
				+}
			
 
				+
			
 
				+static int
				+db_frw_chain(int n, char *crossed_out) {
				+	/*	Follows the forward reference chain that starts at n,
				+		marking each position in crossed_out[] and reporting a
				+		position that was already marked.  Prints and returns
				+		the chain length, which is one less than the number of
				+		positions on the chain.
				+	*/
				+	int n_links = -1;
				+	int pos = n;
				+
				+	while (pos) {
				+		if (crossed_out[pos]) {
				+			fprintf(DebugFile,
				+				">>>> error in forward_references[] <<<<\n"
				+			);
				+		}
				+		n_links++;
				+		crossed_out[pos]++;
				+		pos = forward_references[pos];
				+	}
				+	fprintf(DebugFile, "n = %d, chain_len = %d\n", n, n_links);
				+
				+	return n_links;
				+}
			
 
				+
			
 
				+/*	Debugging aid (DB_FORW_REF only): checks the forward reference
				+	chains and prints statistics on them to DebugFile, under the
				+	heading msg.
				+*/
				+static void
				+db_forward_references(const char *msg) {
				+	int n;
				+	int n_frw_chains = 0;		/* number of forward ref. chains */
				+	int tot_frwc_len = 0;		/* sum of the chain lengths */
				+	char *crossed_out;		/* one mark per text position */
				+
				+	fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
				+	fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
				+	fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
				+
				+	crossed_out = (char *)calloc(TextLength(), sizeof (char));
				+	if (!crossed_out) {
				+		fatal(">>>> no room for db_forward_references debug table <<<<\n");
				+	}
				+
				+	/*	Each forward_references[n] starts in principle a new
				+		chain, and these chains never touch each other.
				+		We check this property by marking the positions in each
				+		chain in an array; if we meet a marked entry while
				+		following a chain, it must have been on an earlier chain
				+		and we have an error.
				+		We also determine the lengths of the chains, for statistics.
				+	*/
				+	if (forward_references[0]) {
				+		fprintf(DebugFile,
				+			">>>> forward_references[0] is not zero <<<<\n"
				+		);
				+	}
				+	for (n = 1; n < TextLength(); n++) {
				+		if (forward_references[n] && !crossed_out[n]) {
				+			/* start of a new chain */
				+			n_frw_chains++;
				+			tot_frwc_len += db_frw_chain(n, crossed_out);
				+		}
				+	}
				+	free((char *)crossed_out);
				+
				+	fprintf(DebugFile,
				+		"text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
				+		TextLength(), n_frw_chains, tot_frwc_len
				+	);
				+}
			
 
				+
			
 
				+#endif	/* DB_FORW_REF */
			
--- a/utils/sim_pasc/hash.h
+++ b/utils/sim_pasc/hash.h
@@ -0,0 +1,12 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: hash.h,v 1.1 1997/06/20 12:03:14 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Creating and consulting the ForwardReference array; to speed up
			
 
				+	the Longest Substring Algorithm.
			
 
				+*/
			
 
				+
			
 
				+extern void MakeForwardReferences(void);
			
 
				+extern void FreeForwardReferences(void);
			
 
				+extern unsigned int ForwardReference(int i);
			
--- a/utils/sim_pasc/idf.c
+++ b/utils/sim_pasc/idf.c
@@ -0,0 +1,67 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: idf.c,v 2.8 2005/02/20 17:03:00 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<string.h>
			
 
				+
			
 
				+#include	"system.par"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+
			
 
				+TOKEN
			
 
				+idf_in_list(
			
 
				+	const char *str,
			
 
				+	const struct idf list[],
			
 
				+	unsigned int listsize,
			
 
				+	TOKEN dflt
			
 
				+) {
			
 
				+	register int first = 0;
			
 
				+	register int last = (listsize / sizeof (struct idf)) - 1;
			
 
				+
			
 
				+	while (first < last) {
			
 
				+		register int middle = (first + last) / 2;
			
 
				+
			
 
				+		if (strcmp(str, list[middle].id_tag) > 0) {
			
 
				+			first = middle + 1;
			
 
				+		}
			
 
				+		else {
			
 
				+			last = middle;
			
 
				+		}
			
 
				+	}
			
 
				+	return (strcmp(str, list[first].id_tag) == 0
			
 
				+	?	list[first].id_tr
			
 
				+	:	dflt
			
 
				+	);
			
 
				+}
			
 
				+
			
 
				+TOKEN
			
 
				+idf_hashed(const char *str) {
			
 
				+	register int32 h = 0;
			
 
				+
			
 
				+	/* let's be careful about ranges; if done wrong it's hard to debug */
			
 
				+	while (*str) {
			
 
				+		/* -1 <= h <= 2^31-1 */
			
 
				+		h = (h << 1) + (*str++&0377);
			
 
				+		/* -2^31 <= h <= 2^31-1 */
			
 
				+		if (h < 0) {
			
 
				+			/* -2^31 <= h <= -1 */
			
 
				+			h += 2147483647;	/* 2^31-1 */
			
 
				+			/* -1 <= h <= 2^31-2 */
			
 
				+		}
			
 
				+		else {
			
 
				+			/* 0 <= h <= 2^31-1 */
			
 
				+		}
			
 
				+		/* -1 <= h <= 2^31-1 */
			
 
				+	}
			
 
				+	/* -1 <= h <= 2^31-1 */
			
 
				+	if (h < 0) {
			
 
				+		/* h = -1 */
			
 
				+		/* a very small chance, but all the same */
			
 
				+		h = 0;
			
 
				+	}
			
 
				+	/* 0 <= h <= 2^31-1 */
			
 
				+	h %= 253;				/* 0 <= h < 253 */
			
 
				+	return NORM(h + 1);			/* 1 <= h < 254 */
			
 
				+	/* this avoids SKIP (0) and EOL (255) */
			
 
				+}
			
--- a/utils/sim_pasc/idf.h
+++ b/utils/sim_pasc/idf.h
@@ -0,0 +1,31 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: idf.h,v 2.5 1998/02/03 14:28:25 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Idf module:
			
 
				+	TOKEN idf_in_list(char *str, struct idf l[], sizeof l, TOKEN dflt);
			
 
				+		looks up a keyword in a list of keywords l, represented as an
			
 
				+		array of struct idf, and returns its translation as a token;
			
 
				+		dflt is returned if the keyword is not found.
			
 
				+	TOKEN idf_hashed(char *str);
			
 
				+		returns a token unequal to SKIP or EOL, derived from the str
			
 
				+		through hashing
			
 
				+	It is assumed that SKIP will be ignored by the user of this module.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+/* the struct for keywords etc. */
			
 
				+struct idf {
			
 
				+	char *id_tag;	/* an interesting identifier */
			
 
				+	TOKEN id_tr;	/* with its one-token translation */
			
 
				+};
			
 
				+
			
 
				+/* special tokens for the idf module */
			
 
				+#define	SKIP		NORM('\0')
			
 
				+#define	IDF		NORM('@')
			
 
				+
			
 
				+/* public functions */
			
 
				+extern TOKEN idf_in_list(const char *, const struct idf [], unsigned int, TOKEN);
			
 
				+extern TOKEN idf_hashed(const char *);
			
--- a/utils/sim_pasc/javalang.l
+++ b/utils/sim_pasc/javalang.l
@@ -0,0 +1,270 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: javalang.l,v 1.4 2007/08/29 09:10:32 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Java language front end for the similarity tester.
			
 
				+	Author:	Dick Grune <[email protected]>
			
 
				+*/
			
 
				+
			
 
				+#include	"options.h"
			
 
				+#include	"algollike.h"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+
			
 
				+static const struct idf reserved[] = {
			
 
				+	{"abstract",	NORM('a')},
			
 
				+	{"boolean",	NORM('b')},
			
 
				+	{"break",	NORM('B')},
			
 
				+	{"byte",	CTRL('B')},
			
 
				+	{"case",	NORM('c')},
			
 
				+	{"catch",	NORM('C')},
			
 
				+	{"char",	CTRL('C')},
			
 
				+	{"class",	META('c')},
			
 
				+	{"continue",	META('C')},
			
 
				+	{"default",	NORM('d')},
			
 
				+	{"do",		NORM('D')},
			
 
				+	{"double",	CTRL('D')},
			
 
				+	{"else",	NORM('e')},
			
 
				+	{"extends",	NORM('E')},
			
 
				+	{"false",	NORM('g')},	/* Boolean literal */
			
 
				+	{"final",	NORM('f')},
			
 
				+	{"finally",	NORM('F')},
			
 
				+	{"float",	CTRL('F')},
			
 
				+	{"for",		META('f')},
			
 
				+	{"if",		NORM('i')},
			
 
				+	{"implements",	NORM('I')},
			
 
				+	{"import",	CTRL('I')},
			
 
				+	{"instanceof",	META('i')},
			
 
				+	{"int",		META('I')},
			
 
				+	{"interface",	MTCT('I')},
			
 
				+	{"long",	NORM('l')},
			
 
				+	{"native",	NORM('n')},
			
 
				+	{"new",		NORM('N')},
			
 
				+	{"null",	CTRL('N')},	/* null literal */
			
 
				+	{"package",	NORM('p')},
			
 
				+	{"private",	NORM('P')},
			
 
				+	{"protected",	CTRL('P')},
			
 
				+	{"public",	META('p')},
			
 
				+	{"return",	NORM('r')},
			
 
				+	{"short",	NORM('s')},
			
 
				+	{"static",	NORM('S')},
			
 
				+	{"super",	CTRL('S')},
			
 
				+	{"switch",	META('s')},
			
 
				+	{"synchronized",META('S')},
			
 
				+	{"this",	NORM('t')},
			
 
				+	{"throw",	NORM('T')},
			
 
				+	{"throws",	CTRL('T')},
			
 
				+	{"true",	META('t')},	/* Boolean literal */
			
 
				+	{"void",	NORM('v')},
			
 
				+	{"volatile",	NORM('V')},
			
 
				+	{"while",	NORM('w')}
			
 
				+};
			
 
				+
			
 
				+/* Special treatment of identifiers */
			
 
				+
			
 
				+static TOKEN
			
 
				+idf2token(int hashing) {
			
 
				+	register TOKEN tk;
			
 
				+
			
 
				+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
			
 
				+	if (TOKEN_EQ(tk, IDF) && hashing) {
			
 
				+		/* return a one-token hash code */
			
 
				+		tk = idf_hashed(yytext);
			
 
				+	}
			
 
				+	return tk;
			
 
				+}
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+const TOKEN NonFinals[] = {
			
 
				+	IDF,		/* identifier */
			
 
				+	NORM('{'),
			
 
				+	NORM('('),
			
 
				+	NORM('a'),	/* abstract */
			
 
				+	NORM('b'),	/* boolean */
			
 
				+	NORM('B'),	/* break */
			
 
				+	CTRL('B'),	/* byte */
			
 
				+	NORM('c'),	/* case */
			
 
				+	NORM('C'),	/* catch */
			
 
				+	CTRL('C'),	/* char */
			
 
				+	META('c'),	/* class */
			
 
				+	META('C'),	/* continue */
			
 
				+	NORM('d'),	/* default */
			
 
				+	NORM('D'),	/* do */
			
 
				+	CTRL('D'),	/* double */
			
 
				+	NORM('e'),	/* else */
			
 
				+	NORM('E'),	/* extends */
			
 
				+	NORM('f'),	/* final */
			
 
				+	NORM('F'),	/* finally */
			
 
				+	CTRL('F'),	/* float */
			
 
				+	META('f'),	/* for */
			
 
				+	NORM('i'),	/* if */
			
 
				+	NORM('I'),	/* implements */
			
 
				+	CTRL('I'),	/* import */
			
 
				+	META('i'),	/* instanceof */
			
 
				+	META('I'),	/* int */
			
 
				+	MTCT('I'),	/* interface */
			
 
				+	NORM('l'),	/* long */
			
 
				+	NORM('n'),	/* native */
			
 
				+	NORM('N'),	/* new */
			
 
				+	NORM('p'),	/* package */
			
 
				+	NORM('P'),	/* private */
			
 
				+	CTRL('P'),	/* protected */
			
 
				+	META('p'),	/* public */
			
 
				+	NORM('r'),	/* return */
			
 
				+	NORM('s'),	/* short */
			
 
				+	NORM('S'),	/* static */
			
 
				+	CTRL('S'),	/* super */
			
 
				+	META('s'),	/* switch */
			
 
				+	META('S'),	/* synchronized */
			
 
				+	NORM('T'),	/* throw */
			
 
				+	CTRL('T'),	/* throws */
			
 
				+	NORM('v'),	/* void */
			
 
				+	NORM('V'),	/* volatile */
			
 
				+	NORM('w'),	/* while */
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN NonInitials[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM('}'),
			
 
				+	NORM(';'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Openers[] = {
			
 
				+	NORM('{'),
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Closers[] = {
			
 
				+	NORM('}'),
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+Digit		([0-9a-fA-F])
			
 
				+
			
 
				+UniCode		(\\u{Digit}{Digit}{Digit}{Digit})
			
 
				+AnyQuoted	((\\.)|{UniCode})
			
 
				+StrChar		([^"\n\\]|{AnyQuoted})
			
 
				+ChrChar		([^'\n\\]|{AnyQuoted})
			
 
				+
			
 
				+StartComment	("/*")
			
 
				+EndComment	("*/")
			
 
				+SafeComChar	([^*\n])
			
 
				+UnsafeComChar	("*")
			
 
				+
			
 
				+SingleLineCom	("//".*)
			
 
				+
			
 
				+Idf		([A-Za-z][A-Za-z0-9_]*)
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+{StartComment}	{
			
 
				+		/*	We do not have one single pattern to match a comment
			
 
				+			(although one can be written), for two reasons.
			
 
				+			The matched string might overflow lex-internal buffers
			
 
				+			like yysbuf and yytext; and the pattern would be very
			
 
				+			complicated and overtax lex.
			
 
				+			So we break up the string into safe chunks and keep
			
 
				+			track of where we are in a start condition <Comment>.
			
 
				+		*/
			
 
				+		BEGIN Comment;
			
 
				+	}
			
 
				+
			
 
				+<Comment>{SafeComChar}+	{		/* safe comment chunk */
			
 
				+	}
			
 
				+
			
 
				+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
			
 
				+	}
			
 
				+
			
 
				+<Comment>"\n"		{		/* to break up long comments */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+<Comment>{EndComment}	{		/* end-of-comment */
			
 
				+		BEGIN INITIAL;
			
 
				+	}
			
 
				+
			
 
				+{SingleLineCom}"\n"	{		/* single-line comment */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+\"{StrChar}*\"	{			/* strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+\'{ChrChar}+\'	{			/* characters */
			
 
				+		return_ch('\'');
			
 
				+	}
			
 
				+
			
 
				+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
			
 
				+		return_tk(IDF);
			
 
				+	}
			
 
				+
			
 
				+"import"{Layout}[^;]*;	{		/* import statement; ignore */
			
 
				+	}
			
 
				+
			
 
				+{Idf}/"("	{			/* identifier in front of ( */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(option_set('F'));
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(0 /* no hashing */);
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+\;	{				/* semicolon, conditionally ignored */
			
 
				+		if (option_set('f')) return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+yywrap(void) {
			
 
				+	return 1;
			
 
				+}
			
--- a/utils/sim_pasc/lang.h
+++ b/utils/sim_pasc/lang.h
@@ -0,0 +1,32 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: lang.h,v 1.2 1998/01/21 14:27:51 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	The token-providing module 'lang' has three interfaces:
			
 
				+	-	lang.h, which provides access to the lowest-level token
			
 
				+			routines, to be used by the next level.
			
 
				+	-	lex.h, which provides the lex variables, to be used by
			
 
				+			all and sundry.
			
 
				+	-	language.h, which provides language-specific info about
			
 
				+			tokens, concerning their suitability as initial
			
 
				+			and final tokens, to be used by higher levels.
			
 
				+			
			
 
				+	This structure is not satisfactory, but it is also unreasonable
			
 
				+	to combine them in one interface.
			
 
				+
			
 
				+	There is no single lang.c; rather it is represented by the
			
 
				+	various Xlang.c files generated from the Xlang.l files.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+/* useful macros */
			
 
				+#define	return_tk(tk)	{lex_tk_cnt++; lex_token = (tk); return 1;}
			
 
				+#define	return_ch(ch)	{lex_tk_cnt++; lex_token = int2TOKEN((int)(ch)); return 1;}
			
 
				+#define	return_eol()	{lex_nl_cnt++; lex_token = EOL; return 1;}
			
 
				+
			
 
				+extern int yylex(void);
			
 
				+extern void yystart(void);
			
 
				+extern FILE *yyin;
			
--- a/utils/sim_pasc/language.h
+++ b/utils/sim_pasc/language.h
@@ -0,0 +1,17 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: language.h,v 1.1 1997/06/20 12:03:15 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	The abstract class Language contains the routines InitLanguage,
			
 
				+	MayBeStartOfRun and CheckRun which describe in some sense the
			
 
				+	language and which are required by compare.c.
			
 
				+	
			
 
				+	These routines must be provided by all Xlang.l files.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+extern void InitLanguage(void);
			
 
				+extern int MayBeStartOfRun(TOKEN ch);
			
 
				+extern unsigned int CheckRun(const TOKEN *str, unsigned int size);
			
--- a/utils/sim_pasc/lex.c
+++ b/utils/sim_pasc/lex.c
@@ -0,0 +1,16 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: lex.c,v 1.3 1998/02/03 14:28:26 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	The communication variables, as set by yylex, NextStreamTokenObtained
			
 
				+	and NextTextTokenObtained.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+#include	"lex.h"
			
 
				+
			
 
				+TOKEN lex_token;			/* token produced, or EOL */
			
 
				+unsigned int lex_nl_cnt;		/* line count */
			
 
				+unsigned int lex_tk_cnt;		/* token position */
			
 
				+unsigned int lex_non_ascii_cnt;		/* # of non-ASCII chars found */
			
--- a/utils/sim_pasc/lex.h
+++ b/utils/sim_pasc/lex.h
@@ -0,0 +1,19 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: lex.h,v 2.5 1998/02/03 14:28:27 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Since the lex_X variables are hoisted unchanged through the levels
			
 
				+	lang, stream, and buff, to be used by pass1, pass2, etc., they
			
 
				+	have to be placed in a module of their own.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+/* special tokens */
			
 
				+#define	EOL		NORM(0377)	/* end of line */
			
 
				+
			
 
				+extern TOKEN lex_token;			/* token produced, or EOL */
			
 
				+extern unsigned int lex_nl_cnt;		/* line count */
			
 
				+extern unsigned int lex_tk_cnt;		/* token position */
			
 
				+extern unsigned int lex_non_ascii_cnt;	/* # of non-ASCII chars found */
			
--- a/utils/sim_pasc/lisplang.l
+++ b/utils/sim_pasc/lisplang.l
@@ -0,0 +1,123 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: lisplang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	LISP language front end for the similarity tester.
			
 
				+	Author:	Gertjan Akkerman <[email protected]>
			
 
				+	Date:	Thu, 9 Apr 87 11:15:23 MDT
			
 
				+*/
			
 
				+
			
 
				+#include	"language.h"
			
 
				+#include	"token.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+#include	"idf.h"
			
 
				+
			
 
				+static const struct idf reserved[] = {
			
 
				+	{"append",	NORM('a')},
			
 
				+	{"append1",	NORM('b')},
			
 
				+	{"atom",	NORM('t')},
			
 
				+	{"car",		NORM('h')},
			
 
				+	{"cdr",		NORM('t')},
			
 
				+	{"cond",	NORM('c')},
			
 
				+	{"cons",	NORM('s')},
			
 
				+	{"defun",	NORM('u')},
			
 
				+	{"do",		NORM('d')},
			
 
				+	{"eq",		NORM('e')},
			
 
				+	{"equal",	NORM('e')},		/* See eq */
			
 
				+	{"for",		NORM('f')},
			
 
				+	{"if",		NORM('i')},
			
 
				+	{"list",	NORM('l')},
			
 
				+	{"nconc",	NORM('n')},
			
 
				+	{"rplaca",	NORM('A')},
			
 
				+	{"rplacd",	NORM('D')}
			
 
				+};
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+const TOKEN NonFinals[] = {
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN NonInitials[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Openers[] = {
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Closers[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+AnyQuoted	(\\.)
			
 
				+StrChar		([^"\n\\]|{AnyQuoted})
			
 
				+ChrChar		([^'\\]|{AnyQuoted})
			
 
				+
			
 
				+IdfChar		([-!#$%&*+,/0-9:;<=>?@A-Z\\^_`a-z{}~])
			
 
				+
			
 
				+EscIdf		(({IdfChar}|\\.)+)
			
 
				+QuotIdf		("|"[^\|\n]*"|")
			
 
				+Idf		({EscIdf}|{QuotIdf})
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+";".*$	{				/* comment */
			
 
				+	}
			
 
				+
			
 
				+\"{StrChar}*\"	{			/* strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+yywrap(void) {
			
 
				+	return 1;
			
 
				+}
			
--- a/utils/sim_pasc/m2lang.l
+++ b/utils/sim_pasc/m2lang.l
@@ -0,0 +1,319 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: m2lang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Modula-2 language front end for the similarity tester.
			
 
				+	Author:	Dick Grune <[email protected]>
			
 
				+*/
			
 
				+
			
 
				+#include	"options.h"
			
 
				+#include	"algollike.h"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+
			
 
				+/*	Most Modula-2 programs start with a number of IMPORTs that look
			
 
				+	very similar from program to program.  These are skipped by ignoring
			
 
				+	the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT
			
 
				+	and FROM, having a flag skip_imports, and start reacting only
			
 
				+	at the first non-ignored reserved word.
			
 
				+
			
 
				+	Also, the nesting comments require a state variable.
			
 
				+*/
			
 
				+
			
 
				+/* Additional state variables, set in yystart() */
			
 
				+static int skip_imports;
			
 
				+static int comment_level;
			
 
				+
			
 
				+/* Data for module idf */
			
 
				+
			
 
				+static const struct idf reserved[] = {
			
 
				+	{"AND",		NORM('&')},
			
 
				+	{"ARRAY",	NORM('A')},
			
 
				+	{"BEGIN",	NORM('{')},
			
 
				+	{"BY",		NORM('B')},
			
 
				+	{"CASE",	NORM('c')},
			
 
				+	{"CONST",	NORM('C')},
			
 
				+	{"DEFINITION",	SKIP},
			
 
				+	{"DIV",		NORM('/')},
			
 
				+	{"DO",		NORM('D')},
			
 
				+	{"ELSE",	NORM('e')},
			
 
				+	{"ELSIF",	NORM('e')},
			
 
				+	{"END",		NORM('}')},
			
 
				+	{"EXIT",	NORM('E')},
			
 
				+	{"EXPORT",	CTRL('E')},
			
 
				+	{"FOR",		NORM('F')},
			
 
				+	{"FROM",	SKIP},
			
 
				+	{"IF",		NORM('i')},
			
 
				+	{"IMPLEMENTATION", SKIP},
			
 
				+	{"IMPORT",	SKIP},
			
 
				+	{"IN",		NORM('I')},
			
 
				+	{"LOOP",	NORM('l')},
			
 
				+	{"MOD",		NORM('%')},
			
 
				+	{"MODULE",	SKIP},
			
 
				+	{"NOT",		NORM('~')},
			
 
				+	{"OF",		SKIP},
			
 
				+	{"OR",		NORM('O')},
			
 
				+	{"POINTER",	NORM('p')},
			
 
				+	{"PROCEDURE",	NORM('P')},
			
 
				+	{"QUALIFIED",	NORM('q')},
			
 
				+	{"RECORD",	NORM('r')},
			
 
				+	{"REPEAT",	NORM('R')},
			
 
				+	{"RETURN",	CTRL('r')},
			
 
				+	{"SET",		NORM('s')},
			
 
				+	{"THEN",	SKIP},
			
 
				+	{"TO",		NORM('t')},
			
 
				+	{"TYPE",	NORM('T')},
			
 
				+	{"UNTIL",	NORM('u')},
			
 
				+	{"VAR",		NORM('v')},
			
 
				+	{"WHILE",	NORM('w')},
			
 
				+	{"WITH",	NORM('W')},
			
 
				+};
			
 
				+
			
 
				+static const struct idf standard[] = {
			
 
				+	{"ABS",		META('a')},
			
 
				+	{"ADDRESS",	META('A')},
			
 
				+	{"ALLOCATE",	MTCT('A')},
			
 
				+	{"BITSET",	META('b')},
			
 
				+	{"BOOLEAN",	META('B')},
			
 
				+	{"CAP",		META('c')},
			
 
				+	{"CARDINAL",	META('C')},
			
 
				+	{"CHAR",	MTCT('C')},
			
 
				+	{"CHR",		META('x')},
			
 
				+	{"DEALLOCATE",	META('d')},
			
 
				+	{"DEC",		META('D')},
			
 
				+	{"EXCL",	META('e')},
			
 
				+	{"FALSE",	META('f')},
			
 
				+	{"FLOAT",	META('F')},
			
 
				+	{"HALT",	META('h')},
			
 
				+	{"HIGH",	META('H')},
			
 
				+	{"INC",		META('i')},
			
 
				+	{"INCL",	META('I')},
			
 
				+	{"INTEGER",	MTCT('I')},
			
 
				+	{"LONGCARD",	META('L')},
			
 
				+	{"LONGINT",	META('L')},
			
 
				+	{"LONGREAL",	META('L')},
			
 
				+	{"MAX",		META('m')},
			
 
				+	{"MIN",		META('M')},
			
 
				+	{"NEWPROCESS",	META('n')},
			
 
				+	{"NIL",		META('N')},
			
 
				+	{"ODD",		META('o')},
			
 
				+	{"ORD",		META('O')},
			
 
				+	{"PROC",	META('p')},
			
 
				+	{"REAL",	META('r')},
			
 
				+	{"SIZE",	META('s')},
			
 
				+	{"SYSTEM",	META('S')},
			
 
				+	{"TRANSFER",	META('t')},
			
 
				+	{"TRUE",	META('T')},
			
 
				+	{"TRUNC",	MTCT('T')},
			
 
				+	{"VAL",		META('v')},
			
 
				+	{"WORD",	META('w')}
			
 
				+};
			
 
				+
			
 
				+/* Special treatment of identifiers */
			
 
				+
			
 
				+static TOKEN
			
 
				+idf2token(int hashing) {
			
 
				+	register TOKEN tk;
			
 
				+
			
 
				+	/* the token can be on two lists, reserved and standard */
			
 
				+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
			
 
				+
			
 
				+	/* is it one of the keywords to be ignored? */
			
 
				+	if (TOKEN_EQ(tk, SKIP)) return tk;
			
 
				+
			
 
				+	/*	The statement below is a significant comment
			
 
				+		on the value of state variables.
			
 
				+	*/
			
 
				+	if (!TOKEN_EQ(tk, IDF)) {
			
 
				+		/* reserved word, stop the skipping */
			
 
				+		skip_imports = 0;
			
 
				+	}
			
 
				+	else {
			
 
				+		/* it is an identifier but not a reserved word */
			
 
				+		if (skip_imports) {
			
 
				+			/* skip it */
			
 
				+			tk = 0;
			
 
				+		}
			
 
				+		else {
			
 
				+			/* look further */
			
 
				+			tk = idf_in_list(yytext, standard, sizeof standard, IDF);
			
 
				+			if (TOKEN_EQ(tk, IDF) && hashing) {
			
 
				+				/* return a one-token hash code */
			
 
				+				tk = idf_hashed(yytext);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	return tk;
			
 
				+}
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+const TOKEN NonFinals[] = {
			
 
				+	IDF,		/* identifier */
			
 
				+	NORM('{'),	/* also BEGIN */
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NORM('A'),	/* ARRAY */
			
 
				+	NORM('c'),	/* CASE */
			
 
				+	NORM('C'),	/* CONST */
			
 
				+	NORM('E'),	/* EXIT */
			
 
				+	NORM('F'),	/* FOR */
			
 
				+	NORM('i'),	/* IF */
			
 
				+	NORM('l'),	/* LOOP */
			
 
				+	NORM('p'),	/* POINTER */
			
 
				+	NORM('P'),	/* PROCEDURE */
			
 
				+	NORM('r'),	/* RECORD */
			
 
				+	NORM('R'),	/* REPEAT */
			
 
				+	CTRL('R'),	/* RETURN */
			
 
				+	NORM('s'),	/* SET */
			
 
				+	NORM('T'),	/* TYPE */
			
 
				+	NORM('v'),	/* VAR */
			
 
				+	NORM('w'),	/* WHILE */
			
 
				+	NORM('W'),	/* WITH */
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN NonInitials[] = {
			
 
				+	NORM('}'),
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NORM(';'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Openers[] = {
			
 
				+	NORM('{'),
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Closers[] = {
			
 
				+	NORM('}'),
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+AnyQuoted	(\\.)
			
 
				+QuStrChar	([^"\n\\]|{AnyQuoted})
			
 
				+ApoStrChar	([^'\n\\]|{AnyQuoted})
			
 
				+
			
 
				+StartComment	("(*")
			
 
				+EndComment	("*)")
			
 
				+SafeComChar	([^*\n])
			
 
				+UnsafeComChar	("*")
			
 
				+
			
 
				+Digit		([0-9a-fA-F])
			
 
				+Idf		([A-Za-z][A-Za-z0-9_]*)
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+{StartComment}	{			/* See clang.l */
			
 
				+		/*	Lex itself is incapable of handling Modula-2's
			
 
				+			nested comments. So let's help it a bit.
			
 
				+		*/
			
 
				+		if (comment_level == 0) {
			
 
				+			BEGIN Comment;
			
 
				+		}
			
 
				+		comment_level++;
			
 
				+	}
			
 
				+
			
 
				+<Comment>{SafeComChar}+	{		/* safe comment chunk */
			
 
				+	}
			
 
				+
			
 
				+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
			
 
				+	}
			
 
				+
			
 
				+<Comment>"\n"		{		/* to break up long comments */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+<Comment>{EndComment}	{		/* end-of-comment */
			
 
				+		comment_level--;
			
 
				+		if (comment_level == 0) {
			
 
				+			BEGIN INITIAL;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+\"{QuStrChar}*\"	{		/* quoted strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+\'{ApoStrChar}*\'	{		/* apostrophed strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+{Digit}+("B"|"C"|"H")?	{		/* numeral, passed as an identifier */
			
 
				+		return_tk(IDF);
			
 
				+	}
			
 
				+
			
 
				+"END"{Layout}*{Idf}	{		/* ignore identifier after END */
			
 
				+		return_tk(idf_in_list("END", reserved, sizeof reserved, SKIP));
			
 
				+	}
			
 
				+
			
 
				+{Idf}/"("	{			/* identifier in front of ( */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(option_set('F'));
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(0 /* no hashing */);
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+"<>"	{				/* <>, special equivalence */
			
 
				+		return_ch('#');
			
 
				+	}
			
 
				+
			
 
				+\;	{				/* semicolon, conditionally ignored */
			
 
				+		if (option_set('f')) return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		if (!skip_imports) return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	skip_imports = 1;
			
 
				+	comment_level = 0;
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+yywrap(void) {
			
 
				+	return 1;
			
 
				+}
			
--- a/utils/sim_pasc/miralang.l
+++ b/utils/sim_pasc/miralang.l
@@ -0,0 +1,131 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: miralang.l,v 1.3 2007/08/29 09:10:34 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Miranda language front end for the similarity tester.
			
 
				+	Author:	Emma Norling ([email protected])
			
 
				+	Date:	Nov 1998
			
 
				+*/
			
 
				+
			
 
				+#include	"language.h"
			
 
				+#include	"token.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+#include	"idf.h"
			
 
				+
			
 
				+static const struct idf reserved[] = {
			
 
				+	{"abstype",	NORM('a')},
			
 
				+	{"bool",	NORM('b')},
			
 
				+	{"char",	NORM('c')},
			
 
				+	{"const",	META('c')},
			
 
				+	{"div",		NORM('d')},
			
 
				+	{"False",	NORM('F')},
			
 
				+	{"if",		NORM('i')},
			
 
				+	{"mod",		NORM('m')},
			
 
				+	{"num",		NORM('n')},
			
 
				+	{"otherwise",	NORM('o')},
			
 
				+	{"readvals",	NORM('r')},
			
 
				+	{"show",	NORM('s')},
			
 
				+	{"sys_message",	META('s')},
			
 
				+	{"True",	NORM('T')},
			
 
				+	{"type",	NORM('t')},
			
 
				+	{"where",	NORM('w')},
			
 
				+	{"with",	META('w')}
			
 
				+};
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+const TOKEN NonFinals[] = {
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NORM('='),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN NonInitials[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Openers[] = {
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NORM('='),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Closers[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+AnyQuoted	(\\.)
			
 
				+StrChar		([^"\n\\]|{AnyQuoted})
			
 
				+ChrChar		([^'\\]|{AnyQuoted})
			
 
				+
			
 
				+Idf		([A-Za-z][A-Za-z0-9_']*)
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+"||".*$	{				/* comment */
			
 
				+	}
			
 
				+
			
 
				+\"{StrChar}*\"	{			/* strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+\'{ChrChar}\'	{			/* characters */
			
 
				+		return_ch('\'');
			
 
				+	}
			
 
				+
			
 
				+\%{Layout}*include.*	{		/* skip %include line */
			
 
				+	}
			
 
				+
			
 
				+\%{Layout}*insert.*	{		/* skip %insert line */
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				/*	End-of-input hook called by the generated scanner; always
					answers 1.
				*/
				int
				yywrap(void)
				{
					const int no_more_input = 1;

					return no_more_input;
				}
			
--- a/utils/sim_pasc/options.c
+++ b/utils/sim_pasc/options.c
@@ -0,0 +1,123 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: options.c,v 1.3 2001/11/13 12:55:53 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<stdlib.h>
			
 
				+
			
 
				+#include	"options.h"
			
 
				+
			
 
				+static char options[128];
			
 
				+
			
 
				+static void bad_option(
			
 
				+	const char *progname, const struct option *optlist, char *msg, int c
			
 
				+);
			
 
				+static int opt_value(const struct option *op, const char *arg, char *argv[]);
			
 
				+
			
 
				+static int do_arg(
			
 
				+	const char *progname, const struct option *optlist,
			
 
				+	const char *arg, char *argv[]
			
 
				+);
			
 
				+
			
 
				/*	Processes the leading option arguments of argv[0..argc-1].
					An argument counts as an option when it starts with '-' and
					has at least one more character; processing stops at the first
					argument that does not.  Each option argument is handed to
					do_arg(), which reports how many argv entries it used up.
					Returns the total number of argv entries consumed.
				*/
				int
				do_options(
					const char *progname, const struct option *optlist,
					int argc, char *argv[]
				) {
					int skips = 0;

					for (;;) {
						const char *arg;
						int used;

						if (argc <= 0) break;
						arg = argv[0];
						if (arg[0] != '-' || arg[1] == '\0') break;

						used = do_arg(progname, optlist, arg + 1, argv);
						argc -= used;
						argv += used;
						skips += used;
					}

					return skips;
				}
			
 
				+
			
 
				+int
			
 
				+option_set(char ch) {
			
 
				+	return options[(int)ch];
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+do_arg(
			
 
				+	const char *progname, const struct option *optlist,
			
 
				+	const char *arg, char *argv[]
			
 
				+) {
			
 
				+	int consumed = 0;
			
 
				+
			
 
				+	while (*arg) {
			
 
				+		/* treat argument character */
			
 
				+		register char opc = *arg++;
			
 
				+		register const struct option *op;
			
 
				+
			
 
				+		for (op = optlist; op->op_char; op++) {
			
 
				+			/* for every allowed option */
			
 
				+			if (opc == op->op_char) {
			
 
				+				options[(int)opc]++;
			
 
				+				if (op->op_indicator != ' ') {
			
 
				+					consumed = opt_value(op, arg, argv);
			
 
				+					if (consumed < 0) {
			
 
				+						bad_option(progname, (struct option *)0,
			
 
				+							" option -%c requires another argument",
			
 
				+							op->op_char
			
 
				+						);
			
 
				+						/*NOTREACHED*/
			
 
				+					}
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+		if (!op->op_char) {
			
 
				+			bad_option(progname, optlist,
			
 
				+				"*option -%c unknown", opc
			
 
				+			);
			
 
				+			/*NOTREACHED*/
			
 
				+		}
			
 
				+		if (consumed) break;
			
 
				+	}
			
 
				+	if (!consumed) {
			
 
				+		consumed = 1;
			
 
				+	}
			
 
				+	
			
 
				+	return consumed;
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+opt_value(const struct option *op, const char *arg, char *argv[]) {
			
 
				+	/* locate the option value */
			
 
				+	if (*arg) {
			
 
				+		/* argument is continuation of option */
			
 
				+		*op->op_stringp = arg;
			
 
				+		return 1;
			
 
				+	}
			
 
				+	else {
			
 
				+		/* argument follows option */
			
 
				+		if (!argv[1]) return -1;
			
 
				+		*op->op_stringp = argv[1];
			
 
				+		return 2;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+bad_option(
			
 
				+	const char *progname, const struct option *optlist, char *msg, int c
			
 
				+) {
			
 
				+	fprintf(stderr, "%s: ", progname);
			
 
				+	fprintf(stderr, &msg[1], c);
			
 
				+	fprintf(stderr, "\n");
			
 
				+
			
 
				+	if (msg[0] != ' ') {
			
 
				+		register const struct option *op;
			
 
				+
			
 
				+		fprintf(stderr, "Possible options are:\n");
			
 
				+		for (op = optlist; op->op_char; op++) {
			
 
				+			fprintf(stderr, "\t-%c%c\t%s\n",
			
 
				+				op->op_char, op->op_indicator, op->op_text
			
 
				+			);
			
 
				+		}
			
 
				+	}
			
 
				+	exit(1);
			
 
				+}
			
--- a/utils/sim_pasc/options.h
+++ b/utils/sim_pasc/options.h
@@ -0,0 +1,20 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: options.h,v 1.3 2001/11/13 12:55:53 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Setting and consulting command line options
			
 
				+*/
			
 
				+
			
 
				/*	Description of one command line option.  A list of these,
					ended by an entry whose op_char is 0, is passed to
					do_options().
				*/
				struct option {
					char op_char;		/* the option letter, as typed after '-' */
					char *op_text;		/* explanation, shown in the option list */
					char op_indicator;	/* value type: N = int, F = file name;
								   a blank means the option has no value */
					const char **op_stringp;/* receives a pointer to the value string */
				};
			
 
				+
			
 
				+extern int option_set(char ch);
			
 
				+extern int do_options(
			
 
				+	const char *progname, const struct option *optlist,
			
 
				+	int argc, char *argv[]
			
 
				+);
			
--- a/utils/sim_pasc/pascallang.l
+++ b/utils/sim_pasc/pascallang.l
@@ -0,0 +1,256 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pascallang.l,v 2.9 2007/08/29 09:10:35 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	PASCAL language front end for the similarity tester.
			
 
				+	Author:	Maarten van der Meulen <[email protected]>
			
 
				+*/
			
 
				+
			
 
				+#include	"options.h"
			
 
				+#include	"algollike.h"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+
			
 
				+/* Data for module idf */
			
 
				+
			
 
				+static const struct idf ppcmd[] = {
			
 
				+	{"define",	META('d')},
			
 
				+	{"else",	META('e')},
			
 
				+	{"endif",	META('E')},
			
 
				+	{"if",		META('i')},
			
 
				+	{"ifdef",	META('I')},
			
 
				+	{"ifndef",	META('x')},
			
 
				+	{"include",	MTCT('I')},
			
 
				+	{"line",	META('l')},
			
 
				+	{"undef",	META('u')}
			
 
				+};
			
 
				+
			
 
				+static const struct idf reserved[] = {
			
 
				+	{"and",		NORM('&')},
			
 
				+	{"array",	NORM('A')},
			
 
				+	{"begin",	NORM('{')},
			
 
				+	{"case",	NORM('c')},
			
 
				+	{"const",	NORM('C')},
			
 
				+	{"div",		NORM('/')},
			
 
				+	{"do",		NORM('D')},
			
 
				+	{"downto",	NORM('d')},
			
 
				+	{"else",	NORM('e')},
			
 
				+	{"end",		NORM('}')},
			
 
				+	{"extern",	CTRL('E')},
			
 
				+	{"file",	NORM('F')},
			
 
				+	{"for",		NORM('f')},
			
 
				+	{"function",	NORM('p')},	/* Equal to procedure */
			
 
				+	{"goto",	NORM('g')},
			
 
				+	{"if",		NORM('i')},
			
 
				+	{"in",		NORM('I')},
			
 
				+	{"label",	NORM('l')},
			
 
				+	{"mod",		NORM('%')},
			
 
				+	{"nil",		NORM('n')},
			
 
				+	{"not",		NORM('!')},
			
 
				+	{"of",		SKIP},
			
 
				+	{"or",		NORM('|')},
			
 
				+	{"packed",	NORM('P')},
			
 
				+	{"procedure",	NORM('p')},
			
 
				+	{"program",	SKIP},
			
 
				+	{"record",	NORM('r')},
			
 
				+	{"repeat",	NORM('R')},
			
 
				+	{"set",		NORM('s')},
			
 
				+	{"then",	SKIP},
			
 
				+	{"to",		NORM('t')},
			
 
				+	{"type",	NORM('T')},
			
 
				+	{"until",	NORM('u')},
			
 
				+	{"var",		NORM('v')},
			
 
				+	{"while",	NORM('w')},
			
 
				+	{"with",	NORM('W')}
			
 
				+};
			
 
				+
			
 
				+/* Special treatment of identifiers */
			
 
				+
			
 
				/*	Converts the upper case letters A-Z in str to lower case, in
					place; Pascal makes no distinction between the two.
				*/
				static void
				lower_case(char *str) {
					char *p = str;

					while (*p != '\0') {
						if (*p >= 'A' && *p <= 'Z') {
							*p = *p - 'A' + 'a';
						}
						p++;
					}
				}
			
 
				+
			
 
				+static TOKEN
			
 
				+idf2token(int hashing) {
			
 
				+	register TOKEN tk;
			
 
				+
			
 
				+	lower_case(yytext);
			
 
				+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
			
 
				+	if (TOKEN_EQ(tk, IDF) && hashing) {
			
 
				+		/* return a one-token hash code */
			
 
				+		tk = idf_hashed(yytext);
			
 
				+	}
			
 
				+	return tk;
			
 
				+}
			
 
				+
			
 
				+/* Token sets for module algollike */
			
 
				+const TOKEN NonFinals[] = {
			
 
				+	IDF,		/* identifier */
			
 
				+	NORM('{'),	/* also begin */
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NORM('A'),	/* array */
			
 
				+	NORM('c'),	/* case */
			
 
				+	NORM('C'),	/* const */
			
 
				+	NORM('/'),	/* div */
			
 
				+	CTRL('E'),	/* extern */
			
 
				+	NORM('F'),	/* file */
			
 
				+	NORM('f'),	/* for */
			
 
				+	NORM('g'),	/* goto */
			
 
				+	NORM('i'),	/* if */
			
 
				+	NORM('l'),	/* label */
			
 
				+	NORM('P'),	/* packed */
			
 
				+	NORM('p'),	/* procedure/function */
			
 
				+	NORM('r'),	/* record */
			
 
				+	NORM('R'),	/* repeat */
			
 
				+	NORM('s'),	/* set */
			
 
				+	NORM('T'),	/* type */
			
 
				+	NORM('v'),	/* var */
			
 
				+	NORM('w'),	/* while */
			
 
				+	NORM('W'),	/* with */
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN NonInitials[] = {
			
 
				+	NORM(')'),
			
 
				+	NORM('}'),
			
 
				+	NORM(';'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Openers[] = {
			
 
				+	NORM('{'),
			
 
				+	NORM('('),
			
 
				+	NORM('['),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+const TOKEN Closers[] = {
			
 
				+	NORM('}'),
			
 
				+	NORM(')'),
			
 
				+	NORM(']'),
			
 
				+	NOTOKEN
			
 
				+};
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+%Start	Comment
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
			
 
				+
			
 
				+AnyQuoted	(\\.)
			
 
				+StrChar		([^'\n\\]|{AnyQuoted})
			
 
				+
			
 
				+StartComment	("{"|"(*")
			
 
				+EndComment	("}"|"*)")
			
 
				+SafeComChar	([^*}\n])
			
 
				+UnsafeComChar	("*")
			
 
				+
			
 
				+Digit		([0-9])
			
 
				+Idf		([A-Za-z][A-Za-z0-9_]*)
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+{StartComment}	{			/* See clang.l */
			
 
				+		BEGIN Comment;
			
 
				+	}
			
 
				+
			
 
				+<Comment>{SafeComChar}+	{		/* safe comment chunk */
			
 
				+	}
			
 
				+
			
 
				+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
			
 
				+	}
			
 
				+
			
 
				+<Comment>"\n"		{		/* to break up long comments */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+<Comment>{EndComment}	{		/* end-of-comment */
			
 
				+		BEGIN INITIAL;
			
 
				+	}
			
 
				+
			
 
				+\'{StrChar}*\'	{			/* character strings */
			
 
				+		return_ch('"');
			
 
				+	}
			
 
				+
			
 
				+^#{Layout}*include.*	{		/* ignore #include lines */
			
 
				+	}
			
 
				+
			
 
				+^#{Layout}*{Idf}	{		/* a preprocessor line */
			
 
				+		register char *idf = yytext+1;
			
 
				+
			
 
				+		/* skip layout in front of preprocessor identifier */
			
 
				+		while (*idf == ' ' || *idf == '\t') {
			
 
				+			idf++;
			
 
				+		}
			
 
				+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
			
 
				+	}
			
 
				+
			
 
				+{Digit}+	{			/* numeral, passed as an identifier */
			
 
				+		return_tk(IDF);
			
 
				+	}
			
 
				+
			
 
				+{Idf}/"("	{			/* identifier in front of ( */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(option_set('F'));
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+{Idf}	{				/* identifier */
			
 
				+		register TOKEN tk;
			
 
				+
			
 
				+		tk = idf2token(0 /* no hashing */);
			
 
				+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
			
 
				+	}
			
 
				+
			
 
				+\;	{				/* semicolon, conditionally ignored */
			
 
				+		if (option_set('f')) return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+{ASCII95}	{			/* copy other text */
			
 
				+		return_ch(yytext[0]);
			
 
				+	}
			
 
				+
			
 
				+.	{				/* count non-ASCII chars */
			
 
				+		lex_non_ascii_cnt++;
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				/*	End-of-input hook called by the generated scanner; always
					answers 1.
				*/
				int
				yywrap(void)
				{
					const int no_more_input = 1;

					return no_more_input;
				}
			
--- a/utils/sim_pasc/pass1.c
+++ b/utils/sim_pasc/pass1.c
@@ -0,0 +1,119 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass1.c,v 2.8 2007/08/27 09:57:32 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<string.h>
			
 
				+
			
 
				+#include	"debug.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"text.h"
			
 
				+#include	"tokenarray.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"error.h"
			
 
				+#include	"pass1.h"
			
 
				+
			
 
				+#ifdef	DB_TEXT
			
 
				+static void db_print_text(const struct text *);
			
 
				+#endif
			
 
				+
			
 
				+static void print_count(unsigned int cnt, const char *);
			
 
				+
			
 
				+void
			
 
				+Pass1(int argc, char *argv[]) {
			
 
				+	register int n;
			
 
				+
			
 
				+	InitText(argc);
			
 
				+	InitTokenArray();
			
 
				+
			
 
				+	/* assume all texts to be new */
			
 
				+	NumberOfNewTexts = NumberOfTexts;
			
 
				+
			
 
				+	/* read the files */
			
 
				+	for (n = 0; n < NumberOfTexts; n++) {
			
 
				+		register char *fname = argv[n];
			
 
				+		register struct text *txt = &Text[n];
			
 
				+
			
 
				+		fprintf(OutputFile, "File %s: ", fname);
			
 
				+
			
 
				+		txt->tx_fname = fname;
			
 
				+		txt->tx_pos = 0;
			
 
				+		txt->tx_start =
			
 
				+		txt->tx_limit = TextLength();
			
 
				+		if (strcmp(fname, "/") == 0) {
			
 
				+			fprintf(OutputFile, "separator\n");
			
 
				+			NumberOfNewTexts = n;
			
 
				+		}
			
 
				+		else {
			
 
				+			if (!OpenText(First, txt)) {
			
 
				+				fprintf(OutputFile, ">>>> cannot open <<<< ");
			
 
				+				/*	the file has still been opened
			
 
				+					with a null file for uniformity
			
 
				+				*/
			
 
				+			}
			
 
				+			while (NextTextTokenObtained(First)) {
			
 
				+				if (!TOKEN_EQ(lex_token, EOL)) {
			
 
				+					StoreToken();
			
 
				+				}
			
 
				+			}
			
 
				+			CloseText(First, txt);
			
 
				+			txt->tx_limit = TextLength();
			
 
				+
			
 
				+			/* report */
			
 
				+			print_count(txt->tx_limit - txt->tx_start, "token");
			
 
				+			if (lex_non_ascii_cnt) {
			
 
				+				fprintf(DebugFile, ", ");
			
 
				+				print_count(lex_non_ascii_cnt,
			
 
				+					"non-ASCII character"
			
 
				+				);
			
 
				+			}
			
 
				+			fprintf(OutputFile, "\n");
			
 
				+#ifdef	DB_TEXT
			
 
				+			db_print_text(txt);
			
 
				+#endif	/* DB_TEXT */
			
 
				+		}
			
 
				+		fflush(OutputFile);
			
 
				+	}
			
 
				+
			
 
				+	/* report total */
			
 
				+	fprintf(OutputFile, "Total: ");
			
 
				+	print_count(TextLength() - 1, "token");
			
 
				+	fprintf(OutputFile, "\n\n");
			
 
				+	fflush(OutputFile);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+print_count(unsigned int cnt, const char *unit) {
			
 
				+	/*	Prints a grammatically correct string "%u %s[s]"
			
 
				+		for units that form their plural by suffixing -s.
			
 
				+	*/
			
 
				+	fprintf(OutputFile, "%u %s%s", cnt, unit, (cnt == 1 ? "" : "s"));
			
 
				+}
			
 
				+
			
 
				+#ifdef	DB_TEXT
			
 
				+
			
 
				+static void
			
 
				+db_print_text(const struct text *txt) {
			
 
				+	/* prints a text (in compressed form) */
			
 
				+	register int i;
			
 
				+
			
 
				+	fprintf(DebugFile, "\n\n**** DB_PRINT_TEXT ****\n");
			
 
				+
			
 
				+	fprintf(DebugFile, "File \"%s\", %u tokens, ",
			
 
				+		txt->tx_fname, txt->tx_limit - txt->tx_start
			
 
				+	);
			
 
				+	fprintf(DebugFile, "txt->tx_start = %u, txt->tx_limit = %u\n",
			
 
				+		txt->tx_start, txt->tx_limit
			
 
				+	);
			
 
				+
			
 
				+	for (i = txt->tx_start; i < txt->tx_limit; i++) {
			
 
				+		if ((i - txt->tx_start + 1) % 32 == 0) {
			
 
				+			fprintf(DebugFile, "\n");
			
 
				+		}
			
 
				+		print_token(stdout, TokenArray[i]);
			
 
				+	}
			
 
				+	fprintf(DebugFile, "\n");
			
 
				+}
			
 
				+
			
 
				+#endif	/* DB_TEXT */
			
--- a/utils/sim_pasc/pass1.h
+++ b/utils/sim_pasc/pass1.h
@@ -0,0 +1,9 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass1.h,v 1.3 2001/09/28 09:03:50 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Reads the input files; stores the tokens in TOKEN TokenArray[]
			
 
				+	and the input file descriptions in struct text Text[].
			
 
				+*/
			
 
				+extern void Pass1(int argc, char *argv[]);
			
--- a/utils/sim_pasc/pass2.c
+++ b/utils/sim_pasc/pass2.c
@@ -0,0 +1,154 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass2.c,v 2.10 2004/08/05 09:49:46 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+
			
 
				+#include	"debug.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"text.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"pass2.h"
			
 
				+
			
 
				+#ifdef	DB_POS
			
 
				+static void db_print_pos_list(const char *, const struct position *);
			
 
				+static void db_print_lex(const char *);
			
 
				+#endif
			
 
				+
			
 
				+static void pass2_txt(struct text *txt);
			
 
				+static int next_eol_obtained(void);
			
 
				+
			
 
				+void
			
 
				+Pass2(void) {
			
 
				+	int n;
			
 
				+
			
 
				+	for (n = 0; n < NumberOfTexts; n++) {
			
 
				+		pass2_txt(&Text[n]);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* instantiate sort_pos_list() */
			
 
				+#define	SORT_STRUCT		position
			
 
				+#define	SORT_NAME		sort_pos_list
			
 
				+#define	SORT_BEFORE(p1,p2)	((p1)->ps_tk_cnt < (p2)->ps_tk_cnt)
			
 
				+#define	SORT_NEXT		ps_next
			
 
				+#include	"sortlist.bdy"
			
 
				+
			
 
				+static void
			
 
				+pass2_txt(struct text *txt) {
			
 
				+	register struct position *pos;
			
 
				+	register unsigned int old_nl_cnt;
			
 
				+
			
 
				+	if (!txt->tx_pos)	/* no need to scan the file */
			
 
				+		return;
			
 
				+
			
 
				+	if (!OpenText(Second, txt)) {
			
 
				+		fprintf(stderr, ">>>> File %s disappeared <<<<\n",
			
 
				+			txt->tx_fname
			
 
				+		);
			
 
				+	}
			
 
				+	/* sets lex_nl_cnt and lex_tk_cnt */
			
 
				+
			
 
				+#ifdef	DB_POS
			
 
				+	db_print_pos_list("before sorting", txt->tx_pos);
			
 
				+#endif	/* DB_POS */
			
 
				+
			
 
				+	sort_pos_list(&txt->tx_pos);
			
 
				+
			
 
				+#ifdef	DB_POS
			
 
				+	db_print_pos_list("after sorting", txt->tx_pos);
			
 
				+#endif	/* DB_POS */
			
 
				+
			
 
				+#ifdef	DB_NL_BUFF
			
 
				+	db_print_nl_buff(txt->tx_nl_start, txt->tx_nl_limit);
			
 
				+#endif	/* DB_NL_BUFF */
			
 
				+
			
 
				+	old_nl_cnt = 1;
			
 
				+	pos = txt->tx_pos;
			
 
				+	while (pos) {
			
 
				+		/* we scan the pos list and the file in parallel */
			
 
				+
			
 
				+		/* find the corresponding line */
			
 
				+		while (pos->ps_tk_cnt >= lex_tk_cnt) {
			
 
				+			/* pos does not refer to this line, try the next */
			
 
				+
			
 
				+			/* shift the administration */
			
 
				+			old_nl_cnt = lex_nl_cnt;
			
 
				+			/* and get the next eol position */
			
 
				+			if (!next_eol_obtained()) {
			
 
				+				/* ouch! not enough lines! */
			
 
				+				fprintf(stderr, ">>>> File %s modified <<<<\n",
			
 
				+					txt->tx_fname
			
 
				+				);
			
 
				+				break;
			
 
				+			}
			
 
				+#ifdef	DB_POS
			
 
				+			db_print_lex(txt->tx_fname);
			
 
				+#endif	/* DB_POS */
			
 
				+		}
			
 
				+
			
 
				+		/* fill in the pos */
			
 
				+		switch (pos->ps_type) {
			
 
				+		case 0:	/* first token of run */
			
 
				+			pos->ps_nl_cnt = old_nl_cnt;
			
 
				+			break;
			
 
				+		case 1:	/* last token of run */
			
 
				+			pos->ps_nl_cnt = lex_nl_cnt;
			
 
				+			break;
			
 
				+		}
			
 
				+		/* and get the next pos */
			
 
				+		pos = pos->ps_next;
			
 
				+	}
			
 
				+
			
 
				+#ifdef	DB_POS
			
 
				+	db_print_pos_list("after scanning", txt->tx_pos);
			
 
				+#endif	/* DB_POS */
			
 
				+
			
 
				+	CloseText(Second, txt);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+next_eol_obtained(void) {
			
 
				+	while (NextTextTokenObtained(Second)) {
			
 
				+		if (TOKEN_EQ(lex_token, EOL)) return 1;
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+#ifdef	DB_POS
			
 
				+
			
 
				+static void
			
 
				+db_print_pos(const struct position *pos) {
			
 
				+	fprintf(DebugFile, "pos type: %s; token count: %u",
			
 
				+		(pos->ps_type == 0 ? "first" : " last"),
			
 
				+		pos->ps_tk_cnt
			
 
				+	);
			
 
				+	fprintf(DebugFile, ", line#: ");
			
 
				+	if (pos->ps_nl_cnt == -1) {
			
 
				+		fprintf(DebugFile, "<NOT SET>");
			
 
				+	}
			
 
				+	else {
			
 
				+		fprintf(DebugFile, "%u", pos->ps_nl_cnt);
			
 
				+	}
			
 
				+	fprintf(DebugFile, "\n");
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+db_print_pos_list(const char *msg, const struct position *pos) {
			
 
				+	fprintf(DebugFile, "\n**** DB_PRINT_POS_LIST, %s ****\n", msg);
			
 
				+
			
 
				+	while (pos) {
			
 
				+		db_print_pos(pos);
			
 
				+		pos = pos->ps_next;
			
 
				+	}
			
 
				+	fprintf(DebugFile, "\n");
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+db_print_lex(const char *fn) {
			
 
				+	fprintf(DebugFile, "%s: lex_tk_cnt = %u, lex_nl_cnt = %u\n",
			
 
				+		fn, lex_tk_cnt, lex_nl_cnt);
			
 
				+}
			
 
				+
			
 
				+#endif	/* DB_POS */
			
--- a/utils/sim_pasc/pass2.h
+++ b/utils/sim_pasc/pass2.h
@@ -0,0 +1,9 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass2.h,v 1.2 1998/01/21 14:27:58 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Determines for each position that is part of a run, at which
			
 
				+	line number it starts and ends.
			
 
				+*/
			
 
				+extern void Pass2(void);
			
--- a/utils/sim_pasc/pass3.c
+++ b/utils/sim_pasc/pass3.c
@@ -0,0 +1,356 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass3.c,v 2.11 2005/02/20 17:03:03 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<string.h>
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"system.par"
			
 
				+#include	"debug.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"runs.h"
			
 
				+#include	"error.h"
			
 
				+#include	"options.h"
			
 
				+#include	"pass3.h"
			
 
				+#include	"percentages.h"
			
 
				+
			
 
				+#ifdef	DB_RUN
			
 
				+#include	"tokenarray.h"
			
 
				+static void db_run(const struct run *);
			
 
				+#endif
			
 
				+
			
 
				+static FILE *open_chunk(const struct chunk *);
			
 
				+static void fill_line(FILE *, char []);
			
 
				+static void clear_line(char []);
			
 
				+static void show_runs(void);
			
 
				+static void show_run(const struct run *);
			
 
				+static void show_2C_line(const char [], const char []);
			
 
				+static void show_1C_line(FILE *, const char *);
			
 
				+static int prhead(const struct chunk *);
			
 
				+static int prs(const char *);
			
 
				+static int pru(unsigned int);
			
 
				+static int unslen(unsigned int);
			
 
				+
			
 
				+static int maxline;			/* Actual maximum line length */
			
 
				+static char *line0;			/* by malloc() */
			
 
				+static char *line1;
			
 
				+
			
 
				/*	Pass 3: produces the output.  With option -p the percentages
					are shown, otherwise the runs themselves.
				*/
				void
				Pass3(void) {
					if (!option_set('p')) {
						show_runs();
						return;
					}
					show_percentages();
				}
			
 
				+
			
 
				+static void
			
 
				+show_runs(void) {
			
 
				+	AisoIter iter;
			
 
				+	struct run *run;
			
 
				+
			
 
				+	maxline = PageWidth / 2 - 2;
			
 
				+	line0 = malloc((unsigned int)((maxline + 1) * sizeof (char)));
			
 
				+	line1 = malloc((unsigned int)((maxline + 1) * sizeof (char)));
			
 
				+	if (!line0 || !line1) fatal("out of memory");
			
 
				+
			
 
				+	OpenIter(&iter);
			
 
				+	while (GetAisoItem(&iter, &run)) {
			
 
				+#ifdef	DB_RUN
			
 
				+		db_run(run);
			
 
				+#endif	/* DB_RUN */
			
 
				+		show_run(run);
			
 
				+		fprintf(OutputFile, "\n");
			
 
				+	}
			
 
				+	CloseIter(&iter);
			
 
				+
			
 
				+	free(line0); line0 = 0;
			
 
				+	free(line1); line1 = 0;
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+show_run(const struct run *run) {
			
 
				+	/* The animals came in two by two ... */
			
 
				+	register const struct chunk *cnk0 = &run->rn_cn0;
			
 
				+	register const struct chunk *cnk1 = &run->rn_cn1;
			
 
				+	register unsigned int nl_cnt0 =
			
 
				+			cnk0->ch_last.ps_nl_cnt - cnk0->ch_first.ps_nl_cnt;
			
 
				+	register unsigned int nl_cnt1 =
			
 
				+			cnk1->ch_last.ps_nl_cnt - cnk1->ch_first.ps_nl_cnt;
			
 
				+	FILE *f0;
			
 
				+	FILE *f1;
			
 
				+
			
 
				+	/* display heading of chunk */
			
 
				+	if (!option_set('d')) {
			
 
				+		/* no assumptions about the lengths of the file names! */
			
 
				+		register unsigned int size = run->rn_size;
			
 
				+		register int pos = 0;
			
 
				+
			
 
				+		pos += prhead(cnk0);
			
 
				+		while (pos < maxline + 1) {
			
 
				+			pos += prs(" ");
			
 
				+		}
			
 
				+		pos += prs("|");
			
 
				+		pos += prhead(cnk1);
			
 
				+		while (pos < 2*maxline - unslen(size)) {
			
 
				+			pos += prs(" ");
			
 
				+		}
			
 
				+		fprintf(OutputFile, "[%u]\n", size);
			
 
				+	}
			
 
				+	else {
			
 
				+		(void)prhead(cnk0);
			
 
				+		fprintf(OutputFile, "\n");
			
 
				+		(void)prhead(cnk1);
			
 
				+		fprintf(OutputFile, "\n");
			
 
				+	}
			
 
				+
			
 
				+	/* stop if that suffices */
			
 
				+	if (option_set('n'))
			
 
				+		return;			/* ... had enough so soon ... */
			
 
				+
			
 
				+	/* open the files that hold the chunks */
			
 
				+	f0 = open_chunk(cnk0);
			
 
				+	f1 = open_chunk(cnk1);
			
 
				+
			
 
				+	/* display the chunks in the required format */
			
 
				+	if (!option_set('d')) {
			
 
				+		/* fill 2-column lines and print them */
			
 
				+		while (nl_cnt0 != 0 || nl_cnt1 != 0) {
			
 
				+			if (nl_cnt0) {
			
 
				+				fill_line(f0, line0);
			
 
				+				nl_cnt0--;
			
 
				+			}
			
 
				+			else {
			
 
				+				clear_line(line0);
			
 
				+			}
			
 
				+			if (nl_cnt1) {
			
 
				+				fill_line(f1, line1);
			
 
				+				nl_cnt1--;
			
 
				+			}
			
 
				+			else {
			
 
				+				clear_line(line1);
			
 
				+			}
			
 
				+			show_2C_line(line0, line1);
			
 
				+		}
			
 
				+	}
			
 
				+	else {
			
 
				+		/* display the lines in a diff(1)-like format */
			
 
				+		while (nl_cnt0--) {
			
 
				+			show_1C_line(f0, "<");
			
 
				+		}
			
 
				+		fprintf(OutputFile, "---\n");
			
 
				+		while (nl_cnt1--) {
			
 
				+			show_1C_line(f1, ">");
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* close the pertinent files */
			
 
				+	fclose(f0);
			
 
				+	fclose(f1);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+prhead(const struct chunk *cnk) {
			
 
				+	register int pos = 0;
			
 
				+
			
 
				+	pos += prs(cnk->ch_text->tx_fname);
			
 
				+	pos += prs(": line ");
			
 
				+	pos += pru(cnk->ch_first.ps_nl_cnt);
			
 
				+	pos += prs("-");
			
 
				+	pos += pru(cnk->ch_last.ps_nl_cnt - 1);
			
 
				+	return pos;
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+prs(const char *str) {
			
 
				+	fprintf(OutputFile, "%s", str);
			
 
				+	return strlen(str);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+pru(unsigned int u) {
			
 
				+	fprintf(OutputFile, "%u", u);
			
 
				+	return unslen(u);
			
 
				+}
			
 
				+
			
 
				/*	Returns the number of decimal digits needed to print u;
					this is 1 for u == 0.
				*/
				static int
				unslen(unsigned int u) {
					int digits;

					for (digits = 1; u > 9; u /= 10) {
						digits++;
					}
					return digits;
				}
			
 
				+
			
 
				+static FILE *
			
 
				+open_chunk(const struct chunk *cnk) {
			
 
				+	/*	opens the file in which the chunk resides, positions the
			
 
				+		file at the beginning of the chunk and returns the file pointer
			
 
				+	*/
			
 
				+	register char *fname = cnk->ch_text->tx_fname;
			
 
				+	register FILE *f = fopen(fname, "r");
			
 
				+	register unsigned int nl_cnt;
			
 
				+
			
 
				+	if (!f) {
			
 
				+		fprintf(stderr, ">>>> File %s disappeared <<<<\n", fname);
			
 
				+		f = fopen(NULLFILE, "r");
			
 
				+	}
			
 
				+
			
 
				+	nl_cnt = cnk->ch_first.ps_nl_cnt;
			
 
				+	while (nl_cnt > 1) {
			
 
				+		int ch = getc(f);
			
 
				+
			
 
				+		if (ch < 0) break;
			
 
				+		if (ch == '\n') {
			
 
				+			nl_cnt--;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return f;
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+fill_line(FILE *f, char ln[]) {
			
 
				+	/*	Reads one line from f and puts it in condensed form in ln.
			
 
				+	*/
			
 
				+	register int indent = 0, lpos = 0;
			
 
				+	register int ch;
			
 
				+
			
 
				+	/* condense and skip initial blank */
			
 
				+	while ((ch = getc(f)), ch == ' ' || ch == '\t') {
			
 
				+		if (ch == '\t') {
			
 
				+			indent = 8;
			
 
				+		}
			
 
				+		else {
			
 
				+			indent++;
			
 
				+		}
			
 
				+		if (indent == 8) {
			
 
				+			/* every eight blanks give one blank */
			
 
				+			if (lpos < maxline) {
			
 
				+				ln[lpos++] = ' ';
			
 
				+			}
			
 
				+			indent = 0;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* store the rest */
			
 
				+	while (ch >= 0 && ch != '\n') {
			
 
				+		if (ch == '\t') {
			
 
				+			/* replace tabs by blanks */
			
 
				+			ch = ' ';
			
 
				+		}
			
 
				+		if (lpos < maxline) {
			
 
				+			ln[lpos++] = ch;
			
 
				+		}
			
 
				+		ch = getc(f);
			
 
				+	}
			
 
				+	ln[lpos] = '\0';		/* always room for this one */
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+clear_line(char ln[]) {
			
 
				+	/* a simple null byte will suffice */
			
 
				+	ln[0] = '\0';
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+show_2C_line(const char ln0[], const char ln1[]) {
			
 
				+	/*	displays the contents of the two lines in a two-column
			
 
				+		format
			
 
				+	*/
			
 
				+	register int i;
			
 
				+
			
 
				+	for (i = 0; i < maxline && ln0[i] != '\0'; i++) {
			
 
				+		fputc(ln0[i], OutputFile);
			
 
				+	}
			
 
				+	for (; i < maxline; i++) {
			
 
				+		fputc(' ', OutputFile);
			
 
				+	}
			
 
				+	fprintf(OutputFile, " |");
			
 
				+
			
 
				+	for (i = 0; i < maxline && ln1[i] != '\0'; i++) {
			
 
				+		fputc(ln1[i], OutputFile);
			
 
				+	}
			
 
				+	fprintf(OutputFile, "\n");
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+show_1C_line(FILE *f, const char *marker) {
			
 
				+	/*	displays one line from f, preceded by the marker; the line
			
 
				+		ends at a newline or at EOF (a negative getc() result) */
			
 
				+	register int ch;
			
 
				+
			
 
				+	fprintf(OutputFile, "%s", marker);
			
 
				+	while ((ch = getc(f)), ch >= 0 && ch != '\n') {
			
 
				+		fputc(ch, OutputFile);
			
 
				+	}
			
 
				+	fputc('\n', OutputFile);
			
 
				+}
			
 
				+
			
 
				+#ifdef	DB_RUN
			
 
				+
			
 
				+static void db_chunk(const struct chunk *);
			
 
				+
			
 
				+static void
			
 
				+db_run(const struct run *run) {
			
 
				+	/* prints detailed data about a run */
			
 
				+	register const struct chunk *cnk0 = &run->rn_cn0;
			
 
				+	register const struct chunk *cnk1 = &run->rn_cn1;
			
 
				+
			
 
				+	fprintf(DebugFile, "File %s / file %s:\n",
			
 
				+		cnk0->ch_text->tx_fname,
			
 
				+		cnk1->ch_text->tx_fname
			
 
				+	);
			
 
				+	fprintf(DebugFile, "from token %u/%u to %u/%u:",
			
 
				+		cnk0->ch_first.ps_tk_cnt, cnk1->ch_first.ps_tk_cnt,
			
 
				+		cnk0->ch_last.ps_tk_cnt, cnk1->ch_last.ps_tk_cnt
			
 
				+	);
			
 
				+	fprintf(DebugFile, " from lines %u/%u to %u/%u:",
			
 
				+		cnk0->ch_first.ps_nl_cnt, cnk1->ch_first.ps_nl_cnt,
			
 
				+		cnk0->ch_last.ps_nl_cnt, cnk1->ch_last.ps_nl_cnt
			
 
				+	);
			
 
				+	fprintf(DebugFile, " %u %s\n",
			
 
				+		run->rn_size,
			
 
				+		(run->rn_size == 1 ? "token" : "tokens")
			
 
				+	);
			
 
				+
			
 
				+	db_chunk(cnk0);
			
 
				+	db_chunk(cnk1);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+db_chunk(const struct chunk *cnk) {
			
 
				+	/*	print the tokens in the chunk on DebugFile, with one token
			
 
				+		of context on either side where available */
			
 
				+	unsigned int i;
			
 
				+	const struct position *first = &cnk->ch_first;
			
 
				+	const struct position *last = &cnk->ch_last;
			
 
				+	unsigned int start = cnk->ch_text->tx_start;
			
 
				+
			
 
				+	if (first->ps_tk_cnt > 0) {
			
 
				+		fprintf(DebugFile, "...");
			
 
				+		print_token(DebugFile, TokenArray[start + first->ps_tk_cnt - 1]);
			
 
				+		fprintf(DebugFile, "  ");
			
 
				+	}
			
 
				+	else {	/* create same offset as above */
			
 
				+		fprintf(DebugFile, "       ");
			
 
				+	}
			
 
				+
			
 
				+	for (i = first->ps_tk_cnt; i <= last->ps_tk_cnt; i++) {
			
 
				+		print_token(DebugFile, TokenArray[start + i]);
			
 
				+	}
			
 
				+
			
 
				+	if (start + last->ps_tk_cnt + 1 < cnk->ch_text->tx_limit) {
			
 
				+		fprintf(DebugFile, "  ");
			
 
				+		print_token(DebugFile, TokenArray[start + last->ps_tk_cnt + 1]);
			
 
				+		fprintf(DebugFile, "...");
			
 
				+	}
			
 
				+
			
 
				+	fprintf(DebugFile, "\n");
			
 
				+}
			
 
				+
			
 
				+#endif	/* DB_RUN */
			
--- a/utils/sim_pasc/pass3.h
+++ b/utils/sim_pasc/pass3.h
@@ -0,0 +1,7 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: pass3.h,v 1.2 1998/01/21 14:28:01 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Print the contents of runs */
			
 
				+extern void Pass3(void);
			
--- a/utils/sim_pasc/percentages.c
+++ b/utils/sim_pasc/percentages.c
@@ -0,0 +1,115 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: percentages.c,v 1.3 2007/08/27 09:57:33 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"sim.h"
			
 
				+#include	"runs.h"
			
 
				+#include	"error.h"
			
 
				+#include	"percentages.h"
			
 
				+
			
 
				+struct match {
			
 
				+	struct match *ma_next;
			
 
				+	struct text *ma_text0;
			
 
				+	struct text *ma_text1;
			
 
				+	unsigned int ma_size;
			
 
				+};
			
 
				+
			
 
				+static struct match *match_start;	/* to be allocated by malloc */
			
 
				+
			
 
				+int
			
 
				+add_to_percentages(struct run *r) {
			
 
				+	/* fails if out of memory, in line with add_to_run() */
			
 
				+	struct match **match_hook = &match_start;
			
 
				+
			
 
				+	/* percentages are only meaningful between different files */
			
 
				+	if (r->rn_cn0.ch_text == r->rn_cn1.ch_text) return 1;
			
 
				+
			
 
				+	/* look (text0, text1) combination up in match list */
			
 
				+	while (*match_hook) {
			
 
				+		struct match *m = *match_hook;
			
 
				+
			
 
				+		if (	m->ma_text0 == r->rn_cn0.ch_text
			
 
				+		&&	m->ma_text1 == r->rn_cn1.ch_text
			
 
				+		) {
			
 
				+			/* found it; now update it */
			
 
				+			m->ma_size += r->rn_size;
			
 
				+			return 1;
			
 
				+		}
			
 
				+		match_hook = &m->ma_next;
			
 
				+	}
			
 
				+
			
 
				+	{	/* it's not there; make a new entry */
			
 
				+		struct match *m = *match_hook =
			
 
				+			(struct match *)malloc(sizeof (struct match));
			
 
				+
			
 
				+		if (m == 0) return 0;
			
 
				+		m->ma_next = 0;
			
 
				+		m->ma_text0 = r->rn_cn0.ch_text;
			
 
				+		m->ma_text1 = r->rn_cn1.ch_text;
			
 
				+		m->ma_size = r->rn_size;
			
 
				+		return 1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+add_reverse_entries_to_match_list(void) {
			
 
				+	struct match **match_hook = &match_start;
			
 
				+
			
 
				+	while (*match_hook) {
			
 
				+		struct match *m = *match_hook;
			
 
				+		struct match *n =
			
 
				+			(struct match *)malloc(sizeof (struct match));
			
 
				+
			
 
				+		if (!n) fatal("out of memory");
			
 
				+		/* hook in the double */
			
 
				+		n->ma_next = m->ma_next;
			
 
				+		m->ma_next = n;
			
 
				+		n->ma_text0 = m->ma_text1;
			
 
				+		n->ma_text1 = m->ma_text0;
			
 
				+		n->ma_size = m->ma_size;
			
 
				+
			
 
				+		match_hook = &n->ma_next;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static float
			
 
				+match_percentage(struct match *m) {
			
 
				+	struct text *text0 = m->ma_text0;
			
 
				+	int size0 = text0->tx_limit - text0->tx_start;
			
 
				+
			
 
				+	return (m->ma_size*1.0/size0);
			
 
				+}
			
 
				+
			
 
				+/* instantiate sort_match_list() */
			
 
				+#define	SORT_STRUCT		match
			
 
				+#define	SORT_NAME		sort_match_list
			
 
				+#define	SORT_BEFORE(p1,p2)	(match_percentage(p1) > match_percentage(p2))
			
 
				+#define	SORT_NEXT		ma_next
			
 
				+#include	"sortlist.bdy"
			
 
				+
			
 
				+static void
			
 
				+print_percentages(void) {
			
 
				+	struct match *m = match_start;
			
 
				+
			
 
				+	while (m) {
			
 
				+		fprintf(OutputFile,
			
 
				+			"%s consists for %d %% of %s material\n",
			
 
				+			m->ma_text0->tx_fname,
			
 
				+			(int)(match_percentage(m)*100.0),
			
 
				+			m->ma_text1->tx_fname
			
 
				+		);
			
 
				+		
			
 
				+		m = m->ma_next;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+show_percentages(void) {
			
 
				+	add_reverse_entries_to_match_list();
			
 
				+	sort_match_list(&match_start);
			
 
				+	print_percentages();
			
 
				+}
			
--- a/utils/sim_pasc/percentages.h
+++ b/utils/sim_pasc/percentages.h
@@ -0,0 +1,7 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: percentages.h,v 1.2 2004/08/05 09:49:48 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+extern int add_to_percentages(struct run *r);
			
 
				+extern void show_percentages(void);
			
--- a/utils/sim_pasc/runs.c
+++ b/utils/sim_pasc/runs.c
@@ -0,0 +1,11 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: runs.c,v 1.2 2001/11/08 12:30:30 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	"sim.h"
			
 
				+#include	"runs.h"
			
 
				+
			
 
				+#define	AISO_BEFORE(r0,r1)	((r0)->rn_size > (r1)->rn_size)
			
 
				+
			
 
				+#include	"aiso.bdy"
			
--- a/utils/sim_pasc/runs.h
+++ b/utils/sim_pasc/runs.h
@@ -0,0 +1,33 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: runs.h,v 1.2 2001/11/08 12:30:30 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Although all other segments of data in this program are described by
			
 
				+	giving the position of the first in the segment and that of the
			
 
				+	first not in the segment (so the size is the difference of the two),
			
 
				+	a `chunk' is given by first and last. This is done because later on we
			
 
				+	are interested in the actual position of the last token of it, and
			
 
				+	the position of the first token not in the segment gives no
			
 
				+	indication about that.
			
 
				+*/
			
 
				+
			
 
				+struct chunk {
			
 
				+	/* a chunk of text in various representations */
			
 
				+	struct text *ch_text;		/* pointer to the file */
			
 
				+	struct position ch_first;	/* first in chunk */
			
 
				+	struct position ch_last;	/* last in chunk */
			
 
				+};
			
 
				+
			
 
				+struct run {				/* a 'run' of coincident tokens */
			
 
				+	struct chunk rn_cn0;		/* chunk in left file */
			
 
				+	struct chunk rn_cn1;		/* chunk in right file */
			
 
				+	unsigned int rn_size;
			
 
				+};
			
 
				+
			
 
				+#define	AISO_TYPE	struct run *
			
 
				+#define	AISO_ITERATOR
			
 
				+
			
 
				+#define	add_to_runs(r)	InsertAiso(r)
			
 
				+
			
 
				+#include	"aiso.spc"
			
--- a/utils/sim_pasc/settings.par
+++ b/utils/sim_pasc/settings.par
@@ -0,0 +1,8 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: settings.par,v 1.1 1997/06/20 12:03:22 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#define	DFLT_MIN_RUN_SIZE	24	/* default minimum run size */
			
 
				+
			
 
				+#define	DFLT_PAGE_WIDTH		80	/* default page width */
			
--- a/utils/sim_pasc/sim.1
+++ b/utils/sim_pasc/sim.1
@@ -0,0 +1,176 @@
 
				+.\"	This file is part of the software similarity tester SIM.
			
 
				+.\"	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+.\"	$Id: sim.1,v 2.6 2004/08/05 09:49:49 dick Exp $
			
 
				+.\"
			
 
				+.TH SIM 1 2001/11/13 "Vrije Universiteit"
			
 
				+.SH NAME
			
 
				+sim \- find similarities in C, Java, Pascal, Modula-2, Lisp, Miranda or text files
			
 
				+.SH SYNOPSIS
			
 
				+.B sim_c
			
 
				+[
			
 
				+.B \-[defFnpsS]
			
 
				+.B \-r
			
 
				+.I N
			
 
				+.B \-w
			
 
				+.I N
			
 
				+.B \-o
			
 
				+.I F
			
 
				+]
			
 
				+file ... [
			
 
				+.B /
			
 
				+[ file ... ] ]
			
 
				+.br
			
 
				+.B sim_c
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_java
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_pasc
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_m2
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_lisp
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_mira
			
 
				+\&...
			
 
				+.br
			
 
				+.B sim_text
			
 
				+\&...
			
 
				+.br
			
 
				+.SH DESCRIPTION
			
 
				+.I Sim_c
			
 
				+reads the C files
			
 
				+.I file ...
			
 
				+and looks for pieces of text that are similar; two pieces of program text
			
 
				+are similar if they only differ in layout, comment, identifiers and
			
 
				+the contents of numbers, strings and characters.
			
 
				+If any runs of sufficient length
			
 
				+are found, they are reported on standard output; the number of significant
			
 
				+tokens in the run is given between square brackets.
			
 
				+.PP
			
 
				+.I Sim_java
			
 
				+does the same for Java,
			
 
				+.I sim_pasc
			
 
				+for Pascal,
			
 
				+.I sim_m2
			
 
				+for Modula-2,
			
 
				+.I sim_lisp
			
 
				+for Lisp, and
			
 
				+.I sim_mira
			
 
				+for Miranda.
			
 
				+.I Sim_text
			
 
				+works on arbitrary text; it is occasionally useful on shell scripts.
			
 
				+.PP
			
 
				+The program can be used for finding copied pieces of code in
			
 
				+purportedly unrelated programs (with
			
 
				+.B \-s
			
 
				+or
			
 
				+.BR \-S ),
			
 
				+or for finding accidentally duplicated code in larger projects (with
			
 
				+.BR \-f ).
			
 
				+.PP
			
 
				+If a
			
 
				+.B /
			
 
				+is present between the input files, the latter are divided into a group of
			
 
				+"new" files (before the
			
 
				+.BR / )
			
 
				+and a group of "old" files; if there is no
			
 
				+.BR / ,
			
 
				+all files are "new".
			
 
				+Old files are never compared to each other.
			
 
				+Since the similarity tester
			
 
				+reads the files several times, it cannot read from standard input.
			
 
				+.PP
			
 
				+There are the following options:
			
 
				+.TP
			
 
				+.B \-d
			
 
				+The output is in a diff(1)-like format instead of the default
			
 
				+2-column format.
			
 
				+.TP
			
 
				+.B \-e
			
 
				+Each file is compared to each file in isolation; this will find all
			
 
				+similarities between all texts involved, regardless of duplicates.
			
 
				+.TP
			
 
				+.B \-f
			
 
				+Runs are restricted to pieces with balancing parentheses, to isolate
			
 
				+potential functions (C, Java, Pascal, Modula-2 and Lisp only).
			
 
				+.TP
			
 
				+.B \-F
			
 
				+The names of functions in calls are required to match exactly
			
 
				+(C, Java, Pascal, Modula-2 and Lisp only).
			
 
				+.TP
			
 
				+.B \-n
			
 
				+Similarities found are only summarized, not displayed.
			
 
				+.TP
			
 
				+.B "\-o F"
			
 
				+The output is written to the file named
			
 
				+.I F.
			
 
				+.TP
			
 
				+.B \-p
			
 
				+The output is given in similarity percentages; see below.
			
 
				+.TP
			
 
				+.B "\-r N"
			
 
				+The minimum run length is set to
			
 
				+.I N
			
 
				+(default is
			
 
				+.I N
			
 
				+= 24).
			
 
				+.TP
			
 
				+.B \-s
			
 
				+The contents of a file are not compared to itself (\-s = not self).
			
 
				+.TP
			
 
				+.B \-S
			
 
				+The contents of the new files are compared to the old files only \- not
			
 
				+between themselves.
			
 
				+.TP
			
 
				+.B "\-w N"
			
 
				+The page width used is set to
			
 
				+.I N
			
 
				+columns (default is
			
 
				+.I N
			
 
				+= 80).
			
 
				+.PP
			
 
				+The
			
 
				+.B \-p
			
 
				+option results in lines of the form
			
 
				+.DS
			
 
				+.ft 5
			
 
				+F consists for x % of G material
			
 
				+.ft P
			
 
				+.DE
			
 
				+meaning that \f5x\fP % of \f5F\fP's text can also be found in \f5G\fP.
			
 
				+Note that this relation is not symmetric; it is in fact quite possible for one
			
 
				+file to consist for 100 % of text from another file, while the other file
			
 
				+consists for only 1 % of text of the first file, if their lengths differ
			
 
				+enough.
			
 
				+Note also that the granularity of the recognized text is still governed by the
			
 
				+.B \-r
			
 
				+option or its default.
			
 
				+.PP
			
 
				+Care has been taken to keep all internal processes linear in the length of the
			
 
				+input, with the exception of the matching process which is almost linear,
			
 
				+using a hash table; various other tables are used for speed-up.
			
 
				+If, however, there is not enough memory for the tables, they are discarded in
			
 
				+order of unimportance, under which conditions the algorithms revert to their
			
 
				+quadratic nature.
			
 
				+.SH AUTHOR
			
 
				+Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+.SH BUGS
			
 
				+Strong periodicity in the input text (like a table of
			
 
				+.I N
			
 
				+almost identical lines) causes problems.
			
 
				+.I Sim
			
 
				+tries to cope with this but cannot avoid giving appr.\&
			
 
				+.I log N
			
 
				+messages about it.
			
 
				+The best advice is still to take the offending files out of the game.
			
 
				+.PP
			
 
				+Since it uses
			
 
				+.I lex(1)
			
 
				+on some systems, it may dump core on any weird construction that overflows
			
 
				+.IR lex 's
			
 
				+internal buffers.
			
--- a/utils/sim_pasc/sim.c
+++ b/utils/sim_pasc/sim.c
@@ -0,0 +1,149 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: sim.c,v 2.12 2007/08/27 09:57:34 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<stdlib.h>
			
 
				+
			
 
				+#include	"settings.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"options.h"
			
 
				+#include	"language.h"
			
 
				+#include	"error.h"
			
 
				+#include	"hash.h"
			
 
				+#include	"compare.h"
			
 
				+#include	"pass1.h"
			
 
				+#include	"pass2.h"
			
 
				+#include	"pass3.h"
			
 
				+#include	"stream.h"
			
 
				+#include	"lex.h"
			
 
				+
			
 
				+unsigned int MinRunSize = DFLT_MIN_RUN_SIZE;
			
 
				+int PageWidth = DFLT_PAGE_WIDTH;
			
 
				+FILE *OutputFile;
			
 
				+FILE *DebugFile;
			
 
				+
			
 
				+struct text *Text;			/* to be filled in by malloc */
			
 
				+int NumberOfTexts;			/* number of text records */
			
 
				+int NumberOfNewTexts;			/* number of new text records */
			
 
				+
			
 
				+char *progname;				/* for error reporting */
			
 
				+
			
 
				+static const char *outputname;		/* for reporting */
			
 
				+static const char *minrunstring;
			
 
				+static const char *pagewidthstring;
			
 
				+
			
 
				+static const struct option optlist[] = {	/* letter, help text, argument kind (' ' = none), argument slot */
			
 
				+	{'r', "minimum run size", 'N', &minrunstring},
			
 
				+	{'w', "page width", 'N', &pagewidthstring},
			
 
				+	{'f', "function-like forms only", ' ', 0},
			
 
				+	{'d', "use diff format for output", ' ', 0},
			
 
				+	{'p', "use percentage format for output", ' ', 0},
			
 
				+	{'e', "compare each file to each file separately", ' ', 0},
			
 
				+	{'s', "do not compare a file to itself", ' ', 0},
			
 
				+	{'S', "compare new files to old files only", ' ', 0},
			
 
				+	{'F', "keep function identifiers intact", ' ', 0},
			
 
				+	{'n', "display headings only", ' ', 0},
			
 
				+	{'x', "no pass2 nl_buff allocation", ' ', 0},
			
 
				+	{'o', "write output to file F", 'F', &outputname},
			
 
				+	{'-', "lexical scan output only", ' ', 0},
			
 
				+	{0, 0, 0, 0}	/* all-zero entry; presumably the end marker for do_options() */
			
 
				+};
			
 
				+
			
 
				+static void print_stream(const char *fname);
			
 
				+
			
 
				+int
			
 
				+main(int argc, char *argv[]) {
			
 
				+	progname = argv[0];		/* save program name */
			
 
				+	argv++, argc--;			/* and skip it */
			
 
				+
			
 
				+	/* Set the default output and debug streams */
			
 
				+	OutputFile = stdout;
			
 
				+	DebugFile = stdout;
			
 
				+
			
 
				+	/* Get command line options */
			
 
				+	{	int nop = do_options(progname, optlist, argc, argv);
			
 
				+		argc -= nop, argv += nop;	/* skip them */
			
 
				+	}
			
 
				+
			
 
				+	/* Treat the value options; zero and unparsable values are rejected */
			
 
				+	if (minrunstring) {
			
 
				+		MinRunSize = strtoul(minrunstring, NULL, 10);
			
 
				+		if (MinRunSize == 0) fatal("bad or zero run size; form is: -r N");
			
 
				+	}
			
 
				+	if (pagewidthstring) {
			
 
				+		PageWidth = atoi(pagewidthstring);	/* signed, so negatives must be refused too */
			
 
				+		if (PageWidth <= 0) fatal("bad or zero page width; form is: -w N");
			
 
				+	}
			
 
				+	if (outputname) {
			
 
				+		OutputFile = fopen(outputname, "w");
			
 
				+		if (OutputFile == 0) {
			
 
				+			char msg[500];	/* the %.400s below keeps the text within this buffer */
			
 
				+
			
 
				+			sprintf(msg, "cannot open output file %.400s", outputname);
			
 
				+			fatal(msg);
			
 
				+			/*NOTREACHED*/
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (option_set('-')) {
			
 
				+		/* it is the lexical scan only */
			
 
				+		while (argv[0]) {
			
 
				+			print_stream(argv[0]);
			
 
				+			argv++;
			
 
				+		}
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	/* Start processing */
			
 
				+	InitLanguage();
			
 
				+
			
 
				+	/* Read the input files */
			
 
				+	Pass1(argc, argv);
			
 
				+
			
 
				+	/* Set up the forward reference table */
			
 
				+	MakeForwardReferences();
			
 
				+
			
 
				+	/* Compare the input files to find runs */
			
 
				+	Compare();
			
 
				+
			
 
				+	/* Delete forward reference table */
			
 
				+	FreeForwardReferences();
			
 
				+
			
 
				+	/* Find positions of the runs found */
			
 
				+	Pass2();
			
 
				+
			
 
				+	/* Print the similarities */
			
 
				+	Pass3();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+print_stream(const char *fname) {
			
 
				+	fprintf(OutputFile, "File %s:", fname);
			
 
				+	if (!OpenStream(fname)) {
			
 
				+		fprintf(OutputFile, " cannot open\n");
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	fprintf(OutputFile, " showing token stream:\nnl_cnt, tk_cnt: tokens");
			
 
				+
			
 
				+	lex_token = EOL;
			
 
				+	do {
			
 
				+		if (TOKEN_EQ(lex_token, EOL)) {
			
 
				+			fprintf(OutputFile, "\n%u,%u:",
			
 
				+				lex_nl_cnt, lex_tk_cnt
			
 
				+			);
			
 
				+		}
			
 
				+		else {
			
 
				+			print_token(OutputFile, lex_token);
			
 
				+		}
			
 
				+	} while (NextStreamTokenObtained());
			
 
				+
			
 
				+	fprintf(OutputFile, "\n");
			
 
				+
			
 
				+	CloseStream();
			
 
				+
			
 
				+}
			
--- a/utils/sim_pasc/sim.h
+++ b/utils/sim_pasc/sim.h
@@ -0,0 +1,39 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: sim.h,v 2.7 2005/02/20 17:03:03 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+
			
 
				+struct position {
			
 
				+	/* position of first and last token of a chunk */
			
 
				+	struct position *ps_next;
			
 
				+	int ps_type;		/* first = 0, last = 1 */
			
 
				+	unsigned int ps_tk_cnt;	/* in tokens; set by add_run() in Pass 1 */
			
 
				+	unsigned int ps_nl_cnt;	/* same, in line numbers; set by Pass2(),
			
 
				+				   used by Pass3() to report line numbers
			
 
				+				*/
			
 
				+};
			
 
				+
			
 
				+struct text {
			
 
				+	char *tx_fname;		/* the file name */
			
 
				+	struct position *tx_pos;/* list of positions in this file that are
			
 
				+				   part of a chunk; sorted and updated by
			
 
				+				   Pass 2
			
 
				+				*/
			
 
				+	unsigned int tx_start;	/* positions in TokenArray[] for the text */
			
 
				+	unsigned int tx_limit;
			
 
				+	unsigned int tx_nl_start;/* possibly newline pointer for pass2 */
			
 
				+	unsigned int tx_nl_limit;
			
 
				+};
			
 
				+
			
 
				+extern unsigned int MinRunSize;
			
 
				+extern int PageWidth;
			
 
				+extern FILE *OutputFile;
			
 
				+extern FILE *DebugFile;
			
 
				+
			
 
				+extern struct text *Text;		/* Text[], one for each input file */
			
 
				+extern int NumberOfTexts;		/* number of text records */
			
 
				+extern int NumberOfNewTexts;		/* number of new text records */
			
 
				+
			
 
				+extern char *progname;			/* for error reporting */
			
--- a/utils/sim_pasc/sim.html
+++ b/utils/sim_pasc/sim.html
@@ -0,0 +1,116 @@
 
				+<HTML>
			
 
				+<!-- $Id: sim.html,v 1.7 2007/08/27 09:57:35 dick Exp $ -->
			
 
				+<HEAD>
			
 
				+<TITLE>The software and text similarity tester SIM</TITLE>
			
 
				+</HEAD>
			
 
				+
			
 
				+<BODY>
			
 
				+<H1>The software and text similarity tester SIM</H1>
			
 
				+
			
 
				+<H2>
			
 
				+<A HREF="http://www.cs.vu.nl/~dick/">Dick Grune</A>
			
 
				+</H2>
			
 
				+
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/README.1st">SIM</A>
			
 
				+tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp, Miranda,
			
 
				+and natural language.
			
 
				+It is used
			
 
				+<UL>
			
 
				+
			
 
				+<LI>
			
 
				+to detect potentially duplicated code fragments in large software
			
 
				+projects, in program text, in shell scripts and in documentation
			
 
				+</LI>
			
 
				+
			
 
				+<LI>
			
 
				+to detect plagiarism in software projects, educational and otherwise
			
 
				+</LI>
			
 
				+
			
 
				+</UL>
			
 
				+
			
 
				+<P>
			
 
				+SIM 2.19 is available as
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.shar">
			
 
				+C sources</A>
			
 
				+and as
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.zip">
			
 
				+MSDOS binaries</A>.
			
 
				+It is also available through ftp; the directory is
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester">
			
 
				+ftp.cs.vu.nl:/pub/dick/similarity_tester</A>.
			
 
				+There is a
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim.pdf">
			
 
				+Unix-style manual page</A>.
			
 
				+</P>
			
 
				+
			
 
				+<P>
			
 
				+The software similarity tester is very efficient and allows us to compare
			
 
				+this year's students' work with that collected from many past years (much to
			
 
				+the dismay of some, mostly non-CS, students).
			
 
				+Students are told that their work is going to be compared, but some are
			
 
				+non-believers ...
			
 
				+</P>
			
 
				+
			
 
				+<P>
			
 
				+The output of the similarity tester can be processed by a number of shell
			
 
				+scripts by Matty Huntjens
			
 
				+(<A HREF="http://www.cs.vu.nl/~matty/">[email protected]</A>).
			
 
				+These shell scripts take sim output and produce lists of suspect submissions,
			
 
				+histograms and the like.
			
 
				+The present version of these scripts is very much geared to the local
			
 
				+situation at the
			
 
				+<A HREF="http://www.vu.nl/">VU University Amsterdam</A>,
			
 
				+though; they are low on portability.
			
 
				+</P>
			
 
				+
			
 
				+<P>
			
 
				+We are not afraid that students would try to tune their work to the
			
 
				+similarity tester.
			
 
				+We reckon if they can do that they can also do the exercise.
			
 
				+</P>
			
 
				+
			
 
				+<P>
			
 
				+Since this piece of handicraft does not qualify as research, there are no
			
 
				+international papers on it.
			
 
				+The work was described in Dutch in
			
 
				+Dick Grune,
			
 
				+Matty Huntjens,
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/publications/Het_detecteren_van_kopieen_bij_informatica-practica.ps">
			
 
				+Het detecteren van kopie&euml;n bij informatica-practica</A>,
			
 
				+Informatie,
			
 
				+<STRONG>31</STRONG>,
			
 
				+11,
			
 
				+Nov 1989,
			
 
				+pp. 864-867
			
 
				+(<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/artikel.lit">
			
 
				+lit. ref.</A>).
			
 
				+An
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/Paper.ps">
			
 
				+English translation
			
 
				+</A>
			
 
				+of the paper is also available.
			
 
				+The ftp directory contains a terse
			
 
				+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/TechnReport">
			
 
				+technical report</A>
			
 
				+about the internal workings of the program.
			
 
				+</P>
			
 
				+
			
 
				+<H5>
			
 
				+<HR>
			
 
				+[<A HREF="CVS.html">Previous</A>]
			
 
				+[<A HREF="mag.html">Next</A>]
			
 
				+[<A HREF="http://www.cs.vu.nl/~dick/dick.html">Personal Page</A>]
			
 
				+[<A HREF="http://www.cs.vu.nl/~dick/">Professional Page</A>]
			
 
				+[<A HREF="http://www.cs.vu.nl/">CS</A>]
			
 
				+[<A HREF="http://www.few.vu.nl/">Faculty</A>]
			
 
				+[<A HREF="http://www.vu.nl/">VU University Amsterdam</A>]
			
 
				+<HR>
			
 
				+</H5>
			
 
				+
			
 
				+<ADDRESS>
			
 
				+The software and text similarity tester SIM / Dick Grune /
			
 
				+<A HREF="mailto:[email protected]">[email protected]</A>
			
 
				+</ADDRESS>
			
 
				+
			
 
				+</BODY>
			
 
				+</HTML>
			
--- a/utils/sim_pasc/sim.txt
+++ b/utils/sim_pasc/sim.txt
@@ -0,0 +1,198 @@
 
				+
			
 
				+
			
 
				+
			
 
				+User Commands                                              SIM(1)
			
 
				+
			
 
				+
			
 
				+
			
 
				+NAME
			
 
				+     sim - find similarities in C, Java, Pascal, Modula-2,  Lisp,
			
 
				+     Miranda or text files
			
 
				+
			
 
				+SYNOPSIS
			
 
				+     sim_c [ -[defFnpsS] -r N -w N -o F ] file ... [ / [ file ...
			
 
				+     ] ]
			
 
				+     sim_c ...
			
 
				+     sim_java ...
			
 
				+     sim_pasc ...
			
 
				+     sim_m2 ...
			
 
				+     sim_lisp ...
			
 
				+     sim_mira ...
			
 
				+     sim_text ...
			
 
				+
			
 
				+DESCRIPTION
			
 
				+     Sim_c reads the C files file ... and  looks  for  pieces  of
			
 
				+     text  that are similar; two pieces of program text are simi-
			
 
				+     lar if they only differ in layout, comment, identifiers  and
			
 
				+     the  contents  of  numbers,  strings and characters.  If any
			
 
				+     runs of sufficient length are found, they  are  reported  on
			
 
				+     standard output; the number of significant tokens in the run
			
 
				+     is given between square brackets.
			
 
				+
			
 
				+     Sim_java does the same for Java, sim_pasc for Pascal, sim_m2
			
 
				+     for  Modula-2,  sim_lisp for Lisp, and sim_mira for Miranda.
			
 
				+     Sim_text works on arbitrary text; it is occasionally  useful
			
 
				+     on shell scripts.
			
 
				+
			
 
				+     The program can be used for finding copied pieces of code in
			
 
				+     purportedly unrelated programs (with -s or -S), or for find-
			
 
				+     ing accidentally duplicated code in  larger  projects  (with
			
 
				+     -f).
			
 
				+
			
 
				+     If a / is present between the input files,  the  latter  are
			
 
				+     divided  into  a  group  of "new" files (before the /) and a
			
 
				+     group of "old" files; if there is no /, all files are "new".
			
 
				+     Old files are never compared to each other.  Since the simi-
			
 
				+     larity tester reads the files several times, it cannot  read
			
 
				+     from standard input.
			
 
				+
			
 
				+     There are the following options:
			
 
				+
			
 
				+     -d   The output is in a diff(1)-like format instead  of  the
			
 
				+          default 2-column format.
			
 
				+
			
 
				+     -e   Each file is compared to each file in  isolation;  this
			
 
				+          will  find all similarities between all texts involved,
			
 
				+          regardless of duplicates.
			
 
				+
			
 
				+     -f   Runs  are   restricted   to   pieces   with   balancing
			
 
				+          parentheses,  to  isolate potential functions (C, Java,
			
 
				+
			
 
				+
			
 
				+
			
 
				+Vrije Universiteit   Last change: 2001/11/13                    1
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+User Commands                                              SIM(1)
			
 
				+
			
 
				+
			
 
				+
			
 
				+          Pascal, Modula-2 and Lisp only).
			
 
				+
			
 
				+     -F   The names of functions in calls are required  to  match
			
 
				+          exactly (C, Java, Pascal, Modula-2 and Lisp only).
			
 
				+
			
 
				+     -n   Similarities found are only summarized, not displayed.
			
 
				+
			
 
				+     -o F The output is written to the file named F.
			
 
				+
			
 
				+     -p   The output is  given  in  similarity  percentages;  see
			
 
				+          below.
			
 
				+
			
 
				+     -r N The minimum run length is set to N (default is N = 24).
			
 
				+
			
 
				+     -s   The contents of a file are not compared to itself (-s =
			
 
				+          not self).
			
 
				+
			
 
				+     -S   The contents of the new files are compared to  the  old
			
 
				+          files only - not between themselves.
			
 
				+
			
 
				+     -w N The page width used is set to N columns (default is N =
			
 
				+          80).
			
 
				+
			
 
				+     The -p option results in lines of the form F consists for  x
			
 
				+     %  of  G  material  meaning that x % of F's text can also be
			
 
				+     found in G.  Note that this relation is not symmetric; it is
			
 
				+     in  fact quite possible for one file to consist for 100 % of
			
 
				+     text from another file, while the other  file  consists  for
			
 
				+     only  1 % of text of the first file, if their lengths differ
			
 
				+     enough.  Note also that the granularity  of  the  recognized
			
 
				+     text is still governed by the -r option or its default.
			
 
				+
			
 
				+     Care has been taken to keep all internal processes linear in
			
 
				+     the  length of the input, with the exception of the matching
			
 
				+     process which is almost linear, using a hash table;  various
			
 
				+     other  tables  are used for speed-up.  If, however, there is
			
 
				+     not enough memory for the  tables,  they  are  discarded  in
			
 
				+     order of unimportance, under which conditions the algorithms
			
 
				+     revert to their quadratic nature.
			
 
				+
			
 
				+AUTHOR
			
 
				+     Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+
			
 
				+BUGS
			
 
				+     Strong periodicity in the input text  (like  a  table  of  N
			
 
				+     almost  identical lines) causes problems.  Sim tries to cope
			
 
				+     with this but cannot avoid giving appr. log N messages about
			
 
				+     it.   The  best  advice is still to take the offending files
			
 
				+     out of the game.
			
 
				+
			
 
				+     Since it uses lex(1) on some systems, it may  dump  core  on
			
 
				+     any   weird   construction  that  overflows  lex's  internal
			
 
				+
			
 
				+
			
 
				+
			
 
				+Vrije Universiteit   Last change: 2001/11/13                    2
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+User Commands                                              SIM(1)
			
 
				+
			
 
				+
			
 
				+
			
 
				+     buffers.
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+Vrije Universiteit   Last change: 2001/11/13                    3
			
 
				+
			
 
				+
			
 
				+
			
--- a/utils/sim_pasc/sortlist.bdy
+++ b/utils/sim_pasc/sortlist.bdy
@@ -0,0 +1,57 @@
 
				+/*
			
 
				+	Module:	Sort Linked Lists
			
 
				+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
			
 
				+	Version:	Tue Sep 17 17:32:33 1991
			
 
				+
			
 
				+Description:
			
 
				+	This is the implementation part of a generic routine that sorts
			
 
				+	linked lists.
			
 
				+
			
 
				+Instantiation:
			
 
				+	See sortlist.spc
			
 
				+*/
			
 
				+
			
 
				+#ifndef	_SORT_EXTERN_DEFINED
			
 
				+static
			
 
				+#endif
			
 
				+void
			
 
				+SORT_NAME(struct SORT_STRUCT **lh) {
			
 
				+	/*	I've  never known that sorting a linked list was this
			
 
				+		complicated; what am I missing?
			
 
				+	*/
			
 
				+	register struct SORT_STRUCT **listhook = lh;
			
 
				+
			
 
				+	while (*listhook) {
			
 
				+		/* 0. the list is not empty -> there must be a smallest one */
			
 
				+		register struct SORT_STRUCT **hsmall;
			
 
				+
			
 
				+		/* 1. find (the pointer to) the smallest element */
			
 
				+		{
			
 
				+			register struct SORT_STRUCT **hook = listhook;
			
 
				+
			
 
				+			/* assume initially that first element is smallest */
			
 
				+			hsmall = hook;
			
 
				+			while (*hook) {
			
 
				+				if (SORT_BEFORE(*hook, *hsmall)) {
			
 
				+					/* revise opinion */
			
 
				+					hsmall = hook;
			
 
				+				}
			
 
				+				hook = &(*hook)->SORT_NEXT;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		/* 2. move the smallest element to front */
			
 
				+		{
			
 
				+			register struct SORT_STRUCT *smallest = *hsmall;
			
 
				+
			
 
				+			/* remove it from the chain */
			
 
				+			*hsmall = smallest->SORT_NEXT;
			
 
				+			/* and insert it before the first element */
			
 
				+			smallest->SORT_NEXT = *listhook;
			
 
				+			*listhook = smallest;
			
 
				+		}
			
 
				+
			
 
				+		/* 3. skip over smallest element */
			
 
				+		listhook = &(*listhook)->SORT_NEXT;
			
 
				+	}
			
 
				+}
			
--- a/utils/sim_pasc/sortlist.spc
+++ b/utils/sim_pasc/sortlist.spc
@@ -0,0 +1,65 @@
 
				+/*
			
 
				+	Module:	Sort Linked Lists
			
 
				+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
			
 
				+	Version:	Tue Sep 17 17:32:33 1991
			
 
				+
			
 
				+Description:
			
 
				+	This is the specification part of a generic routine that sorts linked
			
 
				+	lists. The elements in the list are structs, each of which carries a
			
 
				+	pointer to the next element.
			
 
				+
			
 
				+Instantiation, inline:
			
 
				+	For each struct list type T, specify:
			
 
				+	-	a definition of SORT_STRUCT, the struct name of the linked
			
 
				+		structs
			
 
				+	-	a definition of SORT_NAME, the name of the resulting sort
			
 
				+		routine
			
 
				+	-	a definition of a routine
			
 
				+			int SORT_BEFORE(
			
 
				+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
			
 
				+			)
			
 
				+		which yields non-zero if v is to be sorted before w
			
 
				+	-	a definition of a field selector SORT_NEXT which names the
			
 
				+		field that points to the next struct SORT_STRUCT in the list
			
 
				+	-	#include	"sortlist.bdy"
			
 
				+
			
 
				+Instantiation, separate:
			
 
				+	For each struct list type T, create a file sortT.h which contains at
			
 
				+	least:
			
 
				+	-	a definition of SORT_STRUCT, the struct name of the linked
			
 
				+		structs
			
 
				+	-	a definition of SORT_NAME, the name of the resulting sort
			
 
				+		routine
			
 
				+	-	#include	"sortlist.spc"
			
 
				+
			
 
				+	This file sortT.h is to be included in all files that use the routine
			
 
				+	SORT_NAME.
			
 
				+
			
 
				+	For each struct list type T, create a file sortT.c which contains at
			
 
				+	least:
			
 
				+	-	#include	"sortT.h"
			
 
				+	-	a definition of a routine
			
 
				+			int SORT_BEFORE(
			
 
				+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
			
 
				+			)
			
 
				+		which yields non-zero if v is to be sorted before w
			
 
				+	-	a definition of a field selector SORT_NEXT which names the
			
 
				+		field that points to the next struct SORT_STRUCT in the list
			
 
				+	-	#include	"sortlist.bdy"
			
 
				+
			
 
				+	This file sortT.c compiles into the module object for SORT_STRUCT.
			
 
				+
			
 
				+Specification:
			
 
				+	The module supplies:
			
 
				+	-	void SORT_NAME(struct SORT_STRUCT **listhook)
			
 
				+		where 'listhook' is a pointer to the location that holds the
			
 
				+		pointer to the list to be sorted. Upon return, the list will
			
 
				+		be sorted, and the pointer updated.
			
 
				+		The routine will be defined static when instantiated inline.
			
 
				+
			
 
				+Implementation:
			
 
				+	Linear selection sort (quadratic in the list length) :-(.
			
 
				+*/
			
 
				+
			
 
				+extern void SORT_NAME(struct SORT_STRUCT **);
			
 
				+#define	_SORT_EXTERN_DEFINED
			
--- a/utils/sim_pasc/stream.c
+++ b/utils/sim_pasc/stream.c
@@ -0,0 +1,56 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: stream.c,v 2.7 2001/11/08 12:30:32 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<sys/types.h>
			
 
				+#include	<sys/stat.h>
			
 
				+
			
 
				+#include	"system.par"
			
 
				+#include	"token.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+#include	"stream.h"
			
 
				+
			
 
				+static FILE *fopen_regular_file(const char *fname);
			
 
				+
			
 
				+int
			
 
				+OpenStream(const char *fname) {
			
 
				+	int ok;
			
 
				+
			
 
				+	lex_nl_cnt = 1;
			
 
				+	lex_tk_cnt = 0;
			
 
				+	lex_non_ascii_cnt = 0;
			
 
				+
			
 
				+	/* start the lex machine */
			
 
				+	yyin = fopen_regular_file(fname);
			
 
				+	ok = (yyin != 0);
			
 
				+	if (!ok) {
			
 
				+		/* fake a stream, to simplify the rest of the program */
			
 
				+		yyin = fopen(NULLFILE, "r");
			
 
				+	}
			
 
				+	yystart();
			
 
				+	return ok;
			
 
				+}
			
 
				+
			
 
				static FILE *fopen_regular_file(const char *fname) {
					/*	Opens fname for reading, but only if it names a regular file.
						Returns the stream, or 0 if fname cannot be stat'ed, is not a
						regular file (directory, device, ...), or cannot be opened.
						The file type is tested with the POSIX macro S_ISREG rather
						than with the XSI-only S_IFMT/S_IFREG masks.
					*/
					struct stat buf;

					if (stat(fname, &buf) != 0) return 0;
					if (!S_ISREG(buf.st_mode)) return 0;
					return fopen(fname, "r");
				}
			
 
				+
			
 
				int
				NextStreamTokenObtained(void) {
					/* passes on the result of one call of the lexical analyser */
					int obtained = yylex();

					return obtained;
				}
			
 
				+
			
 
				+void
			
 
				+CloseStream(void) {
			
 
				+	if (yyin) {
			
 
				+		fclose(yyin);
			
 
				+		yyin = 0;
			
 
				+	}
			
 
				+}
			
--- a/utils/sim_pasc/stream.h
+++ b/utils/sim_pasc/stream.h
@@ -0,0 +1,17 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: stream.h,v 2.4 1998/02/03 14:28:36 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Interface of the stream module.
			
 
				+
			
 
				+	Implements the direct interaction with the lexical
			
 
				+	module.  It supplies the routines below.
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+extern int OpenStream(const char *);
			
 
				+extern int NextStreamTokenObtained(void);
			
 
				+extern void CloseStream(void);
			
--- a/utils/sim_pasc/sysidf.mk
+++ b/utils/sim_pasc/sysidf.mk
@@ -0,0 +1,17 @@
 
				+#	I N S T A L L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+BINDIR =	/home/dick/bin.`arch`
			
 
				+MANDIR =	/home/dick/man/man1
			
 
				+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
			
 
				+
			
 
				+#	C O M P I L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+EXE =		#
			
 
				+CC =		gcc -pedantic -Wall
			
 
				+LEX =		flex
			
 
				+COPY =		cp -p
			
 
				+ZIP =		zip -o
			
 
				+LINT =		lint -ansi $(SYSTEM)
			
 
				+LINTFLAGS =	-xh
			
 
				+
			
 
				+SYSTEM =	-DUNIX
			
--- a/utils/sim_pasc/sysidf.msdos
+++ b/utils/sim_pasc/sysidf.msdos
@@ -0,0 +1,17 @@
 
				+#	I N S T A L L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+BINDIR =	/com
			
 
				+MANDIR =	/man
			
 
				+
			
 
				+
			
 
				+#	C O M P I L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+EXE =		.exe
			
 
				+CC =		gcc -pedantic -Wall
			
 
				+LEX =		flex
			
 
				+COPY =		xcopy
			
 
				+ZIP =		pkzip -ko
			
 
				+ATFILEARGS =	gcc.exe:ar.exe:lint.exe## use DOS at-convention for these
			
 
				+LINT =		lint -ansi $(SYSTEM)
			
 
				+
			
 
				+SYSTEM =	-DMSDOS
			
--- a/utils/sim_pasc/sysidf.unix
+++ b/utils/sim_pasc/sysidf.unix
@@ -0,0 +1,19 @@
 
				+#	I N S T A L L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+BINDIR =	/home/dick/bin.`arch`
			
 
				+MANDIR =	/home/dick/man/man1
			
 
				+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
			
 
				+FTPFILES =	README.1st READ_ME TechnReport
			
 
				+VERSION =	2_19
			
 
				+
			
 
				+#	C O M P I L A T I O N   P A R A M E T E R S
			
 
				+
			
 
				+EXE =		#
			
 
				+CC =		gcc -pedantic -Wall
			
 
				+LEX =		flex
			
 
				+COPY =		cp -p
			
 
				+ZIP =		zip -o
			
 
				+LINT =		lint -ansi $(SYSTEM)
			
 
				+LINTFLAGS =	-xh
			
 
				+
			
 
				+SYSTEM =	-DUNIX
			
--- a/utils/sim_pasc/system.par
+++ b/utils/sim_pasc/system.par
@@ -0,0 +1,20 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: system.par,v 1.2 2001/09/28 09:03:55 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Operating-system dependent data */
			
 
				+
			
 
				+#ifdef	UNIX
			
 
				+
			
 
				+#define	int32		int		/* type of a 32 bits signed int */
			
 
				+#define	NULLFILE	"/dev/null"
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#ifdef	MSDOS		/* GNU gcc */
			
 
				+
			
 
				+#define	int32		int		/* type of a 32 bits signed int */
			
 
				+#define	NULLFILE	"nul"
			
 
				+
			
 
				+#endif
			
--- a/utils/sim_pasc/text.c
+++ b/utils/sim_pasc/text.c
@@ -0,0 +1,236 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: text.c,v 1.2 2001/11/13 12:55:58 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"debug.par"
			
 
				+#include	"sim.h"
			
 
				+#include	"token.h"
			
 
				+#include	"stream.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"options.h"
			
 
				+#include	"error.h"
			
 
				+#include	"text.h"
			
 
				+
			
 
				+struct newline {
			
 
				+	unsigned char nl_tk_diff;	/* token position difference */
			
 
				+};
			
 
				+
			
 
				+#define	NL_INCR		1000		/* increment of newline buffer size */
			
 
				+
			
 
				+static struct newline *nl_buff;		/* to be filled by malloc */
			
 
				+static unsigned int nl_size;		/* size of nl_buff[] */
			
 
				+static unsigned int nl_free;		/* next free position in nl_buff[] */
			
 
				+
			
 
				+static unsigned int nl_next, nl_limit;	/* nl_buff[] pointers during pass 2 */
			
 
				+
			
 
				+static void store_newline(void);
			
 
				+static void init_nl_buff(void);
			
 
				+
			
 
				+/*							TEXT INTERFACE */
			
 
				+
			
 
				+static unsigned int last_tk_cnt;	/* token count at newline */
			
 
				+static unsigned int last_nl_cnt;	/* nl counter during pass 2 */
			
 
				+
			
 
				+void
			
 
				+InitText(int nfiles) {
			
 
				+	/* allocate the array of text descriptors */
			
 
				+	NumberOfTexts = nfiles;
			
 
				+	Text = (struct text *)
			
 
				+		malloc((unsigned int)(NumberOfTexts*sizeof (struct text)));
			
 
				+	if (!Text) fatal("out of memory");
			
 
				+
			
 
				+	init_nl_buff();
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+OpenText(enum Pass pass, struct text *txt) {
			
 
				+	switch (pass) {
			
 
				+	case First:
			
 
				+		last_tk_cnt = 0;
			
 
				+		if (nl_buff) {
			
 
				+			txt->tx_nl_start = nl_free;
			
 
				+		}
			
 
				+		break;
			
 
				+
			
 
				+	case Second:
			
 
				+		last_tk_cnt = 0;
			
 
				+		if (nl_buff) {
			
 
				+			nl_next = txt->tx_nl_start;
			
 
				+			nl_limit = txt->tx_nl_limit;
			
 
				+			last_nl_cnt = 1;
			
 
				+			lex_nl_cnt = 1;
			
 
				+			lex_tk_cnt = 0;
			
 
				+			return 1;
			
 
				+		}
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	return OpenStream(txt->tx_fname);
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+NextTextTokenObtained(enum Pass pass) {
			
 
				+	register int ok = 0;	/* gcc does not understand enum Pass */
			
 
				+
			
 
				+	switch (pass) {
			
 
				+	case First:
			
 
				+		ok = NextStreamTokenObtained();
			
 
				+		if (TOKEN_EQ(lex_token, EOL)) {
			
 
				+			store_newline();
			
 
				+			last_tk_cnt = lex_tk_cnt;
			
 
				+		}
			
 
				+		break;
			
 
				+
			
 
				+	case Second:
			
 
				+		/* get newline info from the buffer or from the file itself */
			
 
				+		if (nl_buff) {
			
 
				+			if (nl_next == nl_limit) {
			
 
				+				ok = 0;
			
 
				+			}
			
 
				+			else {
			
 
				+				struct newline *nl = &nl_buff[nl_next++];
			
 
				+
			
 
				+				lex_nl_cnt = ++last_nl_cnt;
			
 
				+				lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff);
			
 
				+				lex_token = EOL;
			
 
				+				ok = 1;
			
 
				+			}
			
 
				+		}
			
 
				+		else {
			
 
				+			while (	(ok = NextStreamTokenObtained())
			
 
				+			&&	!TOKEN_EQ(lex_token, EOL)
			
 
				+			) {
			
 
				+				/* skip */
			
 
				+			}
			
 
				+		}
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	return ok;
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+CloseText(enum Pass pass, struct text *txt) {
			
 
				+	switch (pass) {
			
 
				+	case First:
			
 
				+		if (nl_buff) {
			
 
				+			if (last_tk_cnt != lex_tk_cnt) {
			
 
				+				/* there were tokens after the last newline */
			
 
				+				store_newline();
			
 
				+			}
			
 
				+			txt->tx_nl_limit = nl_free;
			
 
				+		}
			
 
				+		break;
			
 
				+	case Second:
			
 
				+		break;
			
 
				+	}
			
 
				+	CloseStream();
			
 
				+}
			
 
				+
			
 
				+/*							NEWLINE CACHING */
			
 
				+
			
 
				+/*	To speed up pass2 which is interested in token positions at line ends,
			
 
				+	the newline buffer keeps this info from pass1. To reduce the size of
			
 
				+	the newline buffer, the info is kept as the differences of the values
			
 
				+	at consecutive line ends. This allows unsigned chars to be used rather
			
 
				+	than integers.
			
 
				+
			
 
				+	The recording of token position differences at EOL is optional, and
			
 
				+	is switched off if
			
 
				+	-	there is not room enough for the newline buffer.
			
 
				+	-	a difference would not fit in the field in the struct.
			
 
				+	Switching off is done by freeing the buffer and setting nl_buff to 0.
			
 
				+	Anybody using nl_buff should therefore test for nl_buff being zero.
			
 
				+*/
			
 
				+
			
 
				+static void abandon_nl_buff(void);
			
 
				+
			
 
				+static void
			
 
				+init_nl_buff(void) {
			
 
				+	/* Allocate the newline buffer, if possible */
			
 
				+	nl_size = 0 + NL_INCR;
			
 
				+	nl_buff = (option_set('x') ? 0 :
			
 
				+		(struct newline *)malloc(sizeof (struct newline) * nl_size)
			
 
				+	);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+store_newline(void) {
			
 
				+	if (!nl_buff) return;
			
 
				+
			
 
				+	if (nl_free == nl_size) {
			
 
				+		/* allocated array is full; try to increase its size */
			
 
				+		unsigned int new_size = nl_size + NL_INCR;
			
 
				+		struct newline *new_buff = (struct newline *)realloc(
			
 
				+			(char *)nl_buff,
			
 
				+			sizeof (struct newline) * new_size
			
 
				+		);
			
 
				+
			
 
				+		if (!new_buff) {
			
 
				+			/* we failed */
			
 
				+			abandon_nl_buff();
			
 
				+			return;
			
 
				+		}
			
 
				+		nl_buff = new_buff, nl_size = new_size;
			
 
				+	}
			
 
				+
			
 
				+	/* now we are sure there is room enough */
			
 
				+	{
			
 
				+		register struct newline *nl = &nl_buff[nl_free++];
			
 
				+		register unsigned int tk_diff = lex_tk_cnt - last_tk_cnt;
			
 
				+
			
 
				+		nl->nl_tk_diff = tk_diff;
			
 
				+		if (nl->nl_tk_diff != tk_diff) {
			
 
				+			/* tk_diff does not fit in nl_tk_diff */
			
 
				+			abandon_nl_buff();
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+abandon_nl_buff(void) {
			
 
				+	if (nl_buff) {
			
 
				+		free((char *)nl_buff);
			
 
				+		nl_buff = 0;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#ifdef	DB_NL_BUFF
			
 
				+
			
 
				+void
			
 
				+db_print_nl_buff(unsigned int start, unsigned int limit) {
			
 
				+	int i;
			
 
				+
			
 
				+	fprintf(DebugFile, "\n**** DB_NL_BUFF ****\n");
			
 
				+	if (!nl_buff) {
			
 
				+		fprintf(DebugFile, ">>>> NO NL_BUFF\n\n");
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	if (start > nl_free) {
			
 
				+		fprintf(DebugFile, ">>>> start (%u) > nl_free (%u)\n\n",
			
 
				+			start, nl_free
			
 
				+		);
			
 
				+		return;
			
 
				+	}
			
 
				+	if (limit > nl_free) {
			
 
				+		fprintf(DebugFile, ">>>> limit (%u) > nl_free (%u)\n\n",
			
 
				+			limit, nl_free
			
 
				+		);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	fprintf(DebugFile, "nl_buff: %u entries:\n", nl_free);
			
 
				+	for (i = start; i < limit; i++) {
			
 
				+		struct newline *nl = &nl_buff[i];
			
 
				+
			
 
				+		fprintf(DebugFile, "nl_tk_diff = %d\n", nl->nl_tk_diff);
			
 
				+	}
			
 
				+	fprintf(DebugFile, "\n");
			
 
				+}
			
 
				+
			
 
				+#endif	/* DB_NL_BUFF */
			
--- a/utils/sim_pasc/text.h
+++ b/utils/sim_pasc/text.h
@@ -0,0 +1,20 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: text.h,v 1.2 2001/09/28 09:03:56 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*	Implements the access to the lexical scanner.
			
 
				+	Additionally, the module tries to save newline information,
			
 
				+	anticipating a second scan which is interested in this
			
 
				+	information only.
			
 
				+*/
			
 
				+
			
 
				+extern void InitText(int nfiles);
			
 
				+enum Pass {First, Second};
			
 
				+extern int OpenText(enum Pass pass, struct text *txt);
			
 
				+extern int NextTextTokenObtained(enum Pass pass);
			
 
				+extern void CloseText(enum Pass pass, struct text *txt);
			
 
				+
			
 
				+#ifdef	DB_NL_BUFF
			
 
				+extern void db_print_nl_buff(unsigned int start, unsigned int limit);
			
 
				+#endif
			
--- a/utils/sim_pasc/textlang.l
+++ b/utils/sim_pasc/textlang.l
@@ -0,0 +1,72 @@
 
				+%{
			
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: textlang.l,v 1.3 2007/08/29 09:10:36 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Text front end for the similarity tester.
			
 
				+*/
			
 
				+
			
 
				+#include	"language.h"
			
 
				+#include	"token.h"
			
 
				+#include	"idf.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"lang.h"
			
 
				+
			
 
				+/* Language-dependent Code */
			
 
				+
			
 
				void
				InitLanguage(void) {
					/* the text front end has nothing to initialize */
				}
			
 
				+
			
 
				+/*ARGSUSED*/
			
 
				+int
			
 
				+MayBeStartOfRun(TOKEN tk) {
			
 
				+	/* any token is acceptable */
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+/*ARGSUSED*/
			
 
				+unsigned int
			
 
				+CheckRun(const TOKEN *str, unsigned int size) {
			
 
				+	/* any run is acceptable */
			
 
				+	return size;
			
 
				+}
			
 
				+
			
 
				+%}
			
 
				+
			
 
				+%option nounput
			
 
				+%option never-interactive
			
 
				+
			
 
				+Layout		([ \t\r\f])
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+[^ \t\r\f\n]+	{		/* a word */
			
 
				+		/*	a word is defined as anything not containing
			
 
				+			layout (see Layout: \r and \f included) or newline
			
 
				+		*/
			
 
				+		return_tk(idf_hashed(yytext));
			
 
				+	}
			
 
				+
			
 
				+\n	{				/* count newlines */
			
 
				+		return_eol();
			
 
				+	}
			
 
				+
			
 
				+{Layout}	{			/* ignore layout */
			
 
				+	}
			
 
				+
			
 
				+%%
			
 
				+
			
 
				+/* Language-INdependent Code */
			
 
				+
			
 
				+void
			
 
				+yystart(void) {
			
 
				+	BEGIN INITIAL;
			
 
				+}
			
 
				+
			
 
				int
				yywrap(void) {
					/* end of input: report that no further input file follows */
					return 1;
				}
			
--- a/utils/sim_pasc/token.c
+++ b/utils/sim_pasc/token.c
@@ -0,0 +1,44 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: token.c,v 2.4 2001/11/13 12:55:58 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Token interface, implementation part.
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+void
			
 
				+print_token(FILE *ofile, TOKEN tk) {
			
 
				+	/*	prints a token, in two characters:
			
 
				+			normal char		meta (bit 8 set)
			
 
				+			^A	cntl		$A	meta-cntl
			
 
				+			 A	printable	#A	meta
			
 
				+			^?	DEL		$?	meta-DEL
			
 
				+	*/
			
 
				+	register int ch =   TOKEN2int(tk) & 0177;
			
 
				+	register int meta = TOKEN2int(tk) & 0200;
			
 
				+
			
 
				+	if (' ' <= ch && ch <= '~') {
			
 
				+		fprintf(ofile, "%c%c", (meta ? '#' : ' '), ch);
			
 
				+	}
			
 
				+	else {
			
 
				+		fprintf(ofile, "%c%c",
			
 
				+			(meta ? '$' : '^'),
			
 
				+			(ch == 0177 ? '?' : ch + '@')
			
 
				+		);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#ifdef	TESTTOKEN
			
 
				+
			
 
				+int
			
 
				+TOKEN_EQ(TOKEN t1, TOKEN t2) {
			
 
				+	/* to make sure TOKEN_EQ is indeed called with two TOKEN parameters */
			
 
				+	return TOKEN2int(t1) == TOKEN2int(t2);
			
 
				+}
			
 
				+
			
 
				+#endif	/* TESTTOKEN */
			
--- a/utils/sim_pasc/token.h
+++ b/utils/sim_pasc/token.h
@@ -0,0 +1,52 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: token.h,v 2.4 2001/11/13 12:55:59 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+/*
			
 
				+	Token interface.
			
 
				+	Since the definition of a token has been a continual source of
			
 
				+	problems, it is now defined as an Abstract Data Type.
			
 
				+	To allow stronger type checking, there is a special version for use
			
 
				+	by lint.
			
 
				+*/
			
 
				+
			
 
				+#include	<stdio.h>
			
 
				+
			
 
				+#ifndef	TOKEN
			
 
				+
			
 
				+#ifdef	lint
			
 
				+#define	TESTTOKEN
			
 
				+#endif
			
 
				+
			
 
				+#ifdef	TESTTOKEN				/* strict version */
			
 
				+
			
 
				+struct cccc {
			
 
				+	int cccc;
			
 
				+};
			
 
				+
			
 
				+typedef struct cccc *lintTOKEN;
			
 
				+#define	TOKEN		lintTOKEN
			
 
				+#define	TOKEN2int(c)	((int)(c))
			
 
				+#define	int2TOKEN(i)	((TOKEN)(i))
			
 
				+extern int TOKEN_EQ(TOKEN t1, TOKEN t2);
			
 
				+
			
 
				+#else						/* production version */
			
 
				+
			
 
				+#define	TOKEN		unsigned char
			
 
				+#define	TOKEN2int(c)	((c)&0377)
			
 
				+#define	int2TOKEN(i)	((TOKEN)(i))
			
 
				+#define	TOKEN_EQ(t1,t2)	(TOKEN2int(t1) == TOKEN2int(t2))
			
 
				+
			
 
				+#endif	/* TESTTOKEN */
			
 
				+
			
 
				+#endif	/* TOKEN */
			
 
				+
			
 
				+/* Macros for the composition of tokens */
			
 
				+#define	NORM(ch)	int2TOKEN((ch)&0377)
			
 
				+#define	CTRL(ch)	int2TOKEN((ch)&0037)
			
 
				+#define	META(ch)	int2TOKEN((ch)|0200)
			
 
				+#define	MTCT(ch)	int2TOKEN(((ch)&0037)|0200)
			
 
				+#define	NOTOKEN		int2TOKEN(0)
			
 
				+
			
 
				+extern void print_token(FILE *ofile, TOKEN tk);	/* in two characters */
			
--- a/utils/sim_pasc/tokenarray.c
+++ b/utils/sim_pasc/tokenarray.c
@@ -0,0 +1,52 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: tokenarray.c,v 1.2 2001/11/13 12:55:59 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	<malloc.h>
			
 
				+
			
 
				+#include	"error.h"
			
 
				+#include	"lex.h"
			
 
				+#include	"tokenarray.h"
			
 
				+
			
 
				+#define	TK_INCR		10000		/* increment of token array size */
			
 
				+
			
 
				+TOKEN *TokenArray;			/* to be filled by malloc */
			
 
				+static unsigned int tk_size;		/* size of TokenArray[] */
			
 
				+static unsigned int tk_free;		/* next free position in TokenArray[] */
			
 
				+
			
 
				+void
			
 
				+InitTokenArray(void) {
			
 
				+	tk_size = TK_INCR;
			
 
				+	TokenArray = (TOKEN *)malloc(sizeof (TOKEN) * tk_size);
			
 
				+	if (!TokenArray) fatal("out of memory");
			
 
				+	tk_free = 1;		/* don't use position 0 */
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+StoreToken(void) {
			
 
				+	if (tk_free == tk_size) {
			
 
				+		/* allocated array is full; try to increase its size */
			
 
				+		unsigned int new_size = tk_size + TK_INCR;
			
 
				+		register TOKEN *new_array = (TOKEN *)realloc(
			
 
				+			(char *)TokenArray,
			
 
				+			sizeof (TOKEN) * new_size
			
 
				+		);
			
 
				+
			
 
				+		if (new_size < tk_free)
			
 
				+			fatal("internal error: TK_INCR causes numeric overflow");
			
 
				+		if (!new_array) {
			
 
				+			/* we failed */
			
 
				+			fatal("out of memory");
			
 
				+		}
			
 
				+		TokenArray = new_array, tk_size = new_size;
			
 
				+	}
			
 
				+
			
 
				+	/* now we are sure there is room enough */
			
 
				+	TokenArray[tk_free++] = lex_token;
			
 
				+}
			
 
				+
			
 
				+unsigned int
			
 
				+TextLength(void) {
			
 
				+	return tk_free;
			
 
				+}
			
--- a/utils/sim_pasc/tokenarray.h
+++ b/utils/sim_pasc/tokenarray.h
@@ -0,0 +1,13 @@
 
				+/*	This file is part of the software similarity tester SIM.
			
 
				+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
			
 
				+	$Id: tokenarray.h,v 1.1 2001/09/28 09:03:42 dick Exp $
			
 
				+*/
			
 
				+
			
 
				+#include	"token.h"
			
 
				+
			
 
				+/* Interface for the token storage */
			
 
				+extern void InitTokenArray(void);
			
 
				+extern void StoreToken(void);
			
 
				+extern unsigned int TextLength(void);	/* also first free token position */
			
 
				+extern TOKEN *TokenArray;
			
 
				+