18 years ago · 30e4da99da
--- a/.gitattributes
+++ b/.gitattributes
@@ -8965,6 +8965,72 @@ utils/ptop.pp svneol=native#text/plain
 
															 utils/ptopu.pp svneol=native#text/plain
														
 
															 utils/rmcvsdir.pp svneol=native#text/plain
														
 
															 utils/rstconv.pp svneol=native#text/plain
														
 
															+utils/sim_pasc/Answers svneol=native#text/plain
														
 
															+utils/sim_pasc/ChangeLog svneol=native#text/plain
														
 
															+utils/sim_pasc/LICENSE.txt svneol=native#text/plain
														
 
															+utils/sim_pasc/Makefile svneol=native#text/plain
														
 
															+utils/sim_pasc/READ.ME svneol=native#text/plain
														
 
															+utils/sim_pasc/README.1st svneol=native#text/plain
														
 
															+utils/sim_pasc/READ_ME svneol=native#text/plain
														
 
															+utils/sim_pasc/TechnReport svneol=native#text/plain
														
 
															+utils/sim_pasc/add_run.c svneol=native#text/plain
														
 
															+utils/sim_pasc/add_run.h svneol=native#text/plain
														
 
															+utils/sim_pasc/aiso.bdy svneol=native#text/plain
														
 
															+utils/sim_pasc/aiso.spc svneol=native#text/plain
														
 
															+utils/sim_pasc/algollike.c svneol=native#text/plain
														
 
															+utils/sim_pasc/algollike.h svneol=native#text/plain
														
 
															+utils/sim_pasc/clang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/compare.c svneol=native#text/plain
														
 
															+utils/sim_pasc/compare.h svneol=native#text/plain
														
 
															+utils/sim_pasc/debug.par svneol=native#text/plain
														
 
															+utils/sim_pasc/error.c svneol=native#text/plain
														
 
															+utils/sim_pasc/error.h svneol=native#text/plain
														
 
															+utils/sim_pasc/hash.c svneol=native#text/plain
														
 
															+utils/sim_pasc/hash.h svneol=native#text/plain
														
 
															+utils/sim_pasc/idf.c svneol=native#text/plain
														
 
															+utils/sim_pasc/idf.h svneol=native#text/plain
														
 
															+utils/sim_pasc/javalang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/lang.h svneol=native#text/plain
														
 
															+utils/sim_pasc/language.h svneol=native#text/plain
														
 
															+utils/sim_pasc/lex.c svneol=native#text/plain
														
 
															+utils/sim_pasc/lex.h svneol=native#text/plain
														
 
															+utils/sim_pasc/lisplang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/m2lang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/miralang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/options.c svneol=native#text/plain
														
 
															+utils/sim_pasc/options.h svneol=native#text/plain
														
 
															+utils/sim_pasc/pascallang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/pass1.c svneol=native#text/plain
														
 
															+utils/sim_pasc/pass1.h svneol=native#text/plain
														
 
															+utils/sim_pasc/pass2.c svneol=native#text/plain
														
 
															+utils/sim_pasc/pass2.h svneol=native#text/plain
														
 
															+utils/sim_pasc/pass3.c svneol=native#text/plain
														
 
															+utils/sim_pasc/pass3.h svneol=native#text/plain
														
 
															+utils/sim_pasc/percentages.c svneol=native#text/plain
														
 
															+utils/sim_pasc/percentages.h svneol=native#text/plain
														
 
															+utils/sim_pasc/runs.c svneol=native#text/plain
														
 
															+utils/sim_pasc/runs.h svneol=native#text/plain
														
 
															+utils/sim_pasc/settings.par svneol=native#text/plain
														
 
															+utils/sim_pasc/sim.1 svneol=native#text/plain
														
 
															+utils/sim_pasc/sim.c svneol=native#text/plain
														
 
															+utils/sim_pasc/sim.h svneol=native#text/plain
														
 
															+utils/sim_pasc/sim.html svneol=native#text/plain
														
 
															+utils/sim_pasc/sim.txt svneol=native#text/plain
														
 
															+utils/sim_pasc/sortlist.bdy svneol=native#text/plain
														
 
															+utils/sim_pasc/sortlist.spc svneol=native#text/plain
														
 
															+utils/sim_pasc/stream.c svneol=native#text/plain
														
 
															+utils/sim_pasc/stream.h svneol=native#text/plain
														
 
															+utils/sim_pasc/sysidf.mk svneol=native#text/plain
														
 
															+utils/sim_pasc/sysidf.msdos svneol=native#text/plain
														
 
															+utils/sim_pasc/sysidf.unix svneol=native#text/plain
														
 
															+utils/sim_pasc/system.par svneol=native#text/plain
														
 
															+utils/sim_pasc/text.c svneol=native#text/plain
														
 
															+utils/sim_pasc/text.h svneol=native#text/plain
														
 
															+utils/sim_pasc/textlang.l svneol=native#text/plain
														
 
															+utils/sim_pasc/token.c svneol=native#text/plain
														
 
															+utils/sim_pasc/token.h svneol=native#text/plain
														
 
															+utils/sim_pasc/tokenarray.c svneol=native#text/plain
														
 
															+utils/sim_pasc/tokenarray.h svneol=native#text/plain
														
 
															 utils/simulator/Makefile svneol=native#text/plain
														
 
															 utils/simulator/Makefile.fpc svneol=native#text/plain
														
 
															 utils/simulator/alphasim.pas svneol=native#text/plain
														
--- a/utils/sim_pasc/Answers
+++ b/utils/sim_pasc/Answers
@@ -0,0 +1,57 @@
 
															+		The software and text similarity tester SIM
														
 
															+
														
 
															+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
														
 
															+Miranda, and natural language.  It is used
														
 
															+
														
 
															+- to detect potentially duplicated code fragments in large software projects,
														
 
															+	in program text but also in shell scripts and documentation;
														
 
															+- to detect plagiarism in software projects, educational and otherwise.
														
 
															+
														
 
															+SIM is available through ftp.  The directory
														
 
															+
														
 
															+	ftp.cs.vu.nl:pub/dick/similarity_tester
														
 
															+
														
 
															+contains the sources (in C) and the MSDOS .EXEs.
														
 
															+
														
 
															+The software similarity tester is very efficient and allows us to compare
														
 
															+this year's students' work with that collected from many past years (much to
														
 
															+the dismay of some, mostly non-CS, students).  Students are told in advance
														
 
															+that their work is going to be compared, but some are non-believers ...
														
 
															+
														
 
															+The output of the similarity tester can be processed by a number of shell
														
 
															+scripts by Matty Huntjens.  These shell scripts take sim output and produce
														
 
															+lists of suspect submissions, histograms and the like.
														
 
															+The present version of these scripts is very much geared to the local situation
														
 
															+at the Vrije Universiteit, though; they are low on portability.
														
 
															+Matty Huntjens' email address is [email protected].
														
 
															+
														
 
															+We are not afraid that students would try to tune their work to the
														
 
															+similarity tester.  We reckon if they can do that they can also do the
														
 
															+exercise.
														
 
															+
														
 
															+Since this piece of handicraft does not qualify as research, there are no
														
 
															+international papers on it.  A paper, titled `Detecting copied submissions in
														
 
															+computer science lab work', was published in a local (i.e. Dutch) computer
														
 
															+science journal:
														
 
															+
														
 
															+%A Dick Grune
														
 
															+%A Matty Huntjens
														
 
															+%T Het detecteren van kopie\(:en bij informatica-practica
														
 
															+%J Informatie (in Dutch)
														
 
															+%V 31
														
 
															+%N 11
														
 
															+%D Nov 1989
														
 
															+%P 864-867
														
 
															+
														
 
															+The ftp directory contains a terse technical report about the internal
														
 
															+working of the program.
														
 
															+
														
 
															+					Dick Grune
														
 
															+					Vrije Universiteit
														
 
															+					de Boelelaan 1081
														
 
															+					1081 HV  Amsterdam
														
 
															+					the Netherlands
														
 
															+					[email protected]
														
 
															+					+31 20 444 7744
														
 
															+----------------------------------------------------------------
														
 
															+With infinitely many exceptions, what you do makes no difference.
														
--- a/utils/sim_pasc/ChangeLog
+++ b/utils/sim_pasc/ChangeLog
@@ -0,0 +1,580 @@
 
															+2007-08-23  Dick Grune  <[email protected]>
														
 
															+	LICENSE.txt added.
														
 
															+
														
 
															+2006-11-27  Dick Grune  <[email protected]>
														
 
															+	Removal of setbuff() for compatibility.
														
 
															+
														
 
															+2005-01-17  Dick Grune  <[email protected]>
														
 
															+	Corrections by Jerry James <[email protected]>; ANSIizing, etc.
														
 
															+
														
 
															+2004-08-05  Dick Grune  <[email protected]>
														
 
															+	Finished the 'percentage' option.
														
 
															+
														
 
															+08-Nov-2001	Dick Grune
														
 
															+	Begun to add a 'percentage' option, which will express the
														
 
															+	similarity between two files in percents.
														
 
															+
														
 
															+27-Sep-2001	Dick Grune
														
 
															+	Split add_run() off from compare.c into add_run.c, to accommodate
														
 
															+	different add_run()s, for different types of processing.
														
 
															+
														
 
															+27-Nov-1998	Dick Grune
														
 
															+	Installed a Miranda version supplied by Emma Norling ([email protected])
														
 
															+
														
 
															+23-Feb-1998	Dick Grune
														
 
															+	Renamed text.l to textlang.l for uniformity and to make room for
														
 
															+	a possible module text.[ch].
														
 
															+
														
 
															+	Isolated a module for handling the token array from buff.[ch] to
														
 
															+	tokenarray.[ch], and renamed buff.[ch] to text.[ch].
														
 
															+
														
 
															+23-Feb-1998	Dick Grune
														
 
															+	There is probably not much point in abandoning the nl_buff list
														
 
															+	when running out of memory for TokenArray[]: each token costs 1
														
 
															+	byte for the token and 4 bytes for the entry in
														
 
															+	forward_references[], a total of 5 bytes.  There are about 3
														
 
															+	tokens to a line, together requiring 15 bytes, plus 1 byte in
														
 
															+	nl_buff yields 16 bytes.  So releasing nl_buff frees only 1/16 =
														
 
															+	6.25 % of memory.
														
 
															+
														
 
															+	Since the code is a bother, I removed it.  Note that nl_buff is
														
 
															+	still abandoned when the number of tokens in a line does not fit
														
 
															+	in one unsigned char (but that is not very likely to happen).
														
 
															+
														
 
															+	
														
 
															+21-Feb-1998	Dick Grune
														
 
															+	Printing got into an infinite loop when the last line of the
														
 
															+	input was not terminated by a newline AND contained tokens that
														
 
															+	were included in a matching run.
														
 
															+	This was due to a double bug: 1. the non-terminated line was not
														
 
															+	registered properly in NextTextTokenObtained() / CloseText(),
														
 
															+	and 2. the loop in pass 2 which sets the values of
														
 
															+	pos->ps_nl_cnt was terminated prematurely when the file turned
														
 
															+	out to be shorter than the list of pos-es indicated.
														
 
															+	Both bugs were corrected, the first by supplying an extra
														
 
															+	newline in CloseText() when one is found missing, and the second
														
 
															+	by rewriting the list-parallel loop in pass 2.
														
 
															+
														
 
															+02-Feb-1998	Dick Grune
														
 
															+	Pascal does not differentiate between strings and characters
														
 
															+	(strings of one character); this difference has been removed
														
 
															+	from pascallang.l.
														
 
															+
														
 
															+22-Jan-1998	Dick Grune
														
 
															+	Detection of non-ASCII characters added.  Since the lexical
														
 
															+	analyser itself generates non-ASCII characters, the test must occur
														
 
															+	earlier.  We could replace the input routine of lex by a
														
 
															+	checking routine, but with several lex-es going around, we want
														
 
															+	a more lex-independent solution.  To allow each language its own
														
 
															+	restrictions about non-ASCII characters, the check is
														
 
															+	implemented in the *lang.l files.
														
 
															+
														
 
															+28-Nov-1997	Dick Grune
														
 
															+	Changed the name of the C similarity tester 'sim' to 'sim_c', for
														
 
															+	uniformity with sim_java, etc.
														
 
															+
														
 
															+23-Nov-1997	Dick Grune
														
 
															+	Java version finished; checked by Matty Huntjens and crew.
														
 
															+
														
 
															+24-Jun-1997	Dick Grune
														
 
															+	Started on a Java version, by copying the C version.
														
 
															+
														
 
															+22-Jun-1997	Dick Grune
														
 
															+	Modern lexical analysers, among which flex, read the entire input into
														
 
															+	a buffer before they issue the first token.  As a result, ftell() no
														
 
															+	longer gives a usable indication of the position of a token in a file.
														
 
															+	This pulls the rug from under the nl_buff mechanism in buff.c, which
														
 
															+	is removed.  We lose a valuable optimization this way, but there just
														
 
															+	seems to be no way to keep it.
														
 
															+
														
 
															+	Note that this has nothing to do with the problem in MS-DOS of
														
 
															+	character count and fseek position not being synchronized.  That
														
 
															+	problem has been solved on June 14, 1991 (which see) and the code has
														
 
															+	been running OK since.
														
 
															+
														
 
															+18-Jun-1997	Dick Grune
														
 
															+	The thought has occurred to use McCreight's linear longest common
														
 
															+	substring algorithm rather than the existing algorithm, which has a
														
 
															+	small quadratic component.  There are a couple of problems with this:
														
 
															+	1.	We need the longest >non-overlapping< common substring;
														
 
															+		McCreight provides just the longest.  It is not at all clear
														
 
															+		how to modify the algorithm.
														
 
															+	2.	Once we have found our LCS, we want to find the
														
 
															+		one-but-longest; it is far from obvious how to do that in
														
 
															+		McCreight's algorithm.
														
 
															+	3.	Once we have found our LCS, we want to take one of its
														
 
															+		copies out of the game, to suppress duplicate messages.
														
 
															+		Again, it is difficult to see how to do that, without
														
 
															+		redoing all the calculations.
														
 
															+	4.	McCreight's algorithm seems to require about two binary
														
 
															+		tree nodes per token, say 8 bytes, which is double what we
														
 
															+		use now.
														
 
															+
														
 
															+17-Jun-1997	Dick Grune
														
 
															+	Did some experimenting with the hash function; it is still
														
 
															+	pretty bad: the simple-minded second sweep through
														
 
															+	forward_references easily removes another 80-99% of false hits.
														
 
															+	Next, a third sweep that does a full comparison will remove another
														
 
															+	large percentage.
														
 
															+	
														
 
															+	So I have left in the second sweep in all cases.
														
 
															+	
														
 
															+	There are a couple of questions here:
														
 
															+	1. Can we find a better hash function, or will we forever need a
														
 
															+		second sweep?
														
 
															+	2. Does it actually matter, or will we lose on more expensive
														
 
															+		hashing what we gain by having a better set of forward
														
 
															+		references in compare.c?
														
 
															+
														
 
															+
														
 
															+16-Jun-1997	Dick Grune
														
 
															+	Cleaned up sim.h and renamed aiso.[ch] to runs.[ch] since they
														
 
															+	are instantiations of the aiso module concerned with runs.
														
 
															+	Aiso.[spc|bdy] stays aiso.[spc|bdy], of course.
														
 
															+
														
 
															+16-Jun-1997	Dick Grune
														
 
															+	Redid largest_function() in algollike.c.
														
 
															+	Corrected bug in CheckRun; it now always removes NonFinals from
														
 
															+	the end, even when it has first applied largest_function().
														
 
															+
														
 
															+15-Jun-1997	Dick Grune
														
 
															+	Reorganized the layers around the input file.  There were and
														
 
															+	still are three layers: lang, stream and buff.
														
 
															+
														
 
															+	Since the lex_X variables are hoisted unchanged through the levels
														
 
															+	lang, stream, and buff, to be used by pass1, pass2, etc., they
														
 
															+	have to be placed in a module of their own.
														
 
															+
														
 
															+	The token-providing module 'lang' has three interfaces:
														
 
															+	-	lang.h, which provides access to the lowest-level token
														
 
															+			routines, to be used by the next level.
														
 
															+	-	lex.h, which provides the lex variables, to be used by
														
 
															+			all and sundry.
														
 
															+	-	language.h, which provides language-specific info about
														
 
															+			tokens, concerning their suitability as initial
														
 
															+			and final tokens, to be used by higher levels.
														
 
															+			
														
 
															+	This structure is not satisfactory, but it is also unreasonable
														
 
															+	to combine them in one interface.
														
 
															+
														
 
															+	There is no single lang.c; rather it is represented by the
														
 
															+	various Xlang.c files generated from the Xlang.l files.
														
 
															+
														
 
															+14-Jun-1997	Dick Grune
														
 
															+	Added a Makefile zip entry to parallel the shar entry.
														
 
															+
														
 
															+13-Jun-1997	Dick Grune
														
 
															+	A number of simplifications, in view of better software and bigger
														
 
															+	machines:
														
 
															+	-	Removed good_realloc from hash.c; I don't think there are
														
 
															+		any bad reallocs left.
														
 
															+	-	Removed the option to run without forward_references.
														
 
															+		On a 16Mb machine this means you have at least 2M tokens;
														
 
															+		using a quadratic algorithm will take 4*10^6 sec. at an
														
 
															+		impossible rate of 1M actions/sec., which is some 50 days.
														
 
															+		Forget it.
														
 
															+	-	Renamed lang() to print_stream(), and incorporated it in sim.c
														
 
															+	-	Removed the MSDOS subdirectory mechanism in the Makefile.
														
 
															+	-	Removed the funny and sneaky double parameter expansion in
														
 
															+		the call of idf_in_list().
														
 
															+
														
 
															+12-Jun-1997	Dick Grune
														
 
															+	Converted to ANSI C.  Removed cport.h.
														
 
															+
														
 
															+09-Jan-1995	Dick Grune
														
 
															+	Decided not to do directories: they usually contain extraneous
														
 
															+	files and doing sim * is simple enough anyway.
														
 
															+
														
 
															+09-Sep-1994	Dick Grune
														
 
															+	Added system.h to cater for the (few) differences between Unix and DOS.
														
 
															+	The #define int32 is also supplied there.
														
 
															+
														
 
															+05-Sep-1994	Dick Grune
														
 
															+	Added many prototype declarations using cport.h.
														
 
															+	Added a depend entry to the Makefile.
														
 
															+
														
 
															+31-Aug-1994	Dick Grune
														
 
															+	All these changes require a 32 bit integer; introduced a #define
														
 
															+	int32, set from the command line in the Makefile.
														
 
															+
														
 
															+25-Aug-1994	Dick Grune
														
 
															+	It turned out that one of the most often called routines was .rem,
														
 
															+	from idf_hashed() in idf.c.  Moving the % out of the loop chafed off
														
 
															+	another 6% and reduced the time to 18.4 sec.
														
 
															+
														
 
															+19-Aug-1994	Dick Grune
														
 
															+	With very large files (e.g., concatenated /usr/man/man1/*) the fixed
														
 
															+	built-in hash table size of 10639 is no longer satisfactory.  Hash.c
														
 
															+	now finds a prime about 8 times smaller than the text_size to use
														
 
															+	for hash table size; this achieves optimal speed-up without gobbling
														
 
															+	up too much memory.  Reduced the time for the above file from 30.2
														
 
															+	sec. to 19.6 sec.
														
 
															+	For checking, the same test was run with all hashing off; it took
														
 
															+	20h 27m 19s = 73639 sec.  But it worked.
														
 
															+
														
 
															+11-Aug-1994	Dick Grune
														
 
															+	For large values of MinRunSize (>1000) a large part of the time
														
 
															+	(>two-thirds) was spent in calculating the hash values for each
														
 
															+	position in the input, since the cost of this calculation was
														
 
															+	proportional to MinRunSize.  We now sample a maximum of 24 tokens
														
 
															+	from the input string to calculate the hash value, and avoid
														
 
															+	overflow.  On my workstation, this reduces the time for
														
 
															+		sim_text -r 1000 -n /usr/man/man1/*
														
 
															+	from 60 sec to 21 sec.
														
 
															+
														
 
															+30-Jun-1992	Dick Grune,kamer R4.40,telef. 5778
														
 
															+	There was an amazing bug in buff.c where NextTextToken() for pass 2
														
 
															+	omitted to set lex_token to EOL when retrieving newline info from
														
 
															+	nl_buff. Worked until now!?!
														
 
															+
														
 
															+23-Sep-1991	Dick Grune
														
 
															+	Cport.h introduced, CONST and *.spc only.
														
 
															+
														
 
															+17-Sep-1991	Dick Grune
														
 
															+	The position-sorting routine in pass2.c has been made into a
														
 
															+	separate generic module.
														
 
															+
														
 
															+14-Jun-1991	Dick Grune ([email protected]) at dick.cs.vu.nl
														
 
															+	Replaced the determination of the input position through counting
														
 
															+	input characters by calls of ftell(); this is cleaner and the other
														
 
															+	method will never work on MSDOS.
														
 
															+
														
 
															+30-May-1989	Dick Grune (dick) at dick
														
 
															+	Replaced the old top-100 module (which had been extended to top-10000
														
 
															+	already anyway) by the new aiso (arbitrary-in sorted-out) module.
														
 
															+	This caused a considerable speed-up on the Mod2 test bed:
														
 
															+		 %time  cumsecs  #call  ms/call  name
														
 
															+		  17.9    99.20   7209    13.76  _InsertTop
														
 
															+		   0.3     1.37   7209     0.19  _InsertAiso
														
 
															+	It turns out that malloc() is not a serious problem, so no special
														
 
															+	version for the aiso module is required.
														
 
															+
														
 
															+23-May-1989	Dick Grune (dick) at dick
														
 
															+	No more uncommented comment at the end of preprocessor lines, to
														
 
															+	conform to ANSI C.
														
 
															+
														
 
															+23-May-1989	Dick Grune (dick) at dick
														
 
															+	Added code in the X.l files to (silently) reject characters over 0200.
														
 
															+	This does not really help, since lex stops on null chars. Ah, well.
														
 
															+
														
 
															+19-May-1989	Dick Grune (dick) at dick
														
 
															+	Made the token as handled by sim into an abstract data type, for
														
 
															+	aesthetic reasons. Sign extension is still a problem.
														
 
															+
														
 
															+03-May-1989	Dick Grune (dick) at dick
														
 
															+	Optimized lcs() by first checking from the end if a sufficiently long
														
 
															+	run is present; if in fact only the first 12 tokens match, chances
														
 
															+	are good that you can reject the run right away by first testing
														
 
															+	the 20th token, then the 19th, and so on.
														
 
															+
														
 
															+21-Apr-1989	Dick Grune (dick) at dick
														
 
															+	A run of sim_m2 finding 7209 similarities raised the question of
														
 
															+	the appropriateness of the linear sort in sort_pos(). Profiling
														
 
															+	showed that in this case sorting takes all of 7.5 % of the total
														
 
															+	time. Inserting the word 'register' in the right places in
														
 
															+	sort_pos() lowered this number to 4.6%.
														
 
															+
														
 
															+20-Apr-1989	Dick Grune (dick) at dick
														
 
															+	Moved the test for MayBeStartOfRun() from compare.c (where it is
														
 
															+	done again and again) to hash.c, where its effect is incorporated in
														
 
															+	the forward reference chain.
														
 
															+
														
 
															+14-Apr-1989	Dick Grune (dick) at dick
														
 
															+	Replaced elem_of() by bit tables, headers[] and trailers[], to be
														
 
															+	prefilled from Headers[] and Trailers[] by a call of
														
 
															+	InitLanguage(). This saves a few percents.
														
 
															+
														
 
															+13-Apr-1989	Dick Grune (dick) at dick
														
 
															+	Implemented the -e and the -S option, by putting yet another loop
														
 
															+	in compare.c
														
 
															+
														
 
															+13-Apr-1989	Dick Grune (dick) at dick
														
 
															+	The -- option (displaying the tokens) will now handle more than one
														
 
															+	file.
														
 
															+
														
 
															+20-Jan-1989	Dick Grune (dick) at dick
														
 
															+	After the modification of 19-Dec-88, 12% of the time went into
														
 
															+	updating the positions in the chunks, as they were produced by the
														
 
															+	matching process. This matching process identifies runs (matches)
														
 
															+	by token position, which has to be recalculated to lseek positions
														
 
															+	and line numbers. To this end the files are read again, and for
														
 
															+	each line all positions found were checked to see if they applied
														
 
															+	to this line; this was an awfully stupid algorithm, but since much
														
 
															+	more time was spent elsewhere, it did not really matter. With all
														
 
															+	the saving below, however, it had risen to second position, after
														
 
															+	yylook() with 35%.
														
 
															+
														
 
															+	The solution was to sort the positions in the same order in which
														
 
															+	they would be met by the reading of the files. The process is then
														
 
															+	linear. This required some extensive hacking in pass2.c
														
 
															+
														
 
															+06-Jan-1989	Dick Grune (dick) at dick
														
 
															+	The modification below did indeed save 25%. The newline information
														
 
															+	is now reduced to 2 shorts; 2 chars were not enough, since some
														
 
															+	lines are longer than 127 bytes, and a char and a short together
														
 
															+	take as much room as two shorts.
														
 
															+
														
 
															+19-Dec-1988	Dick Grune (dick) at dick
														
 
															+	To avoid reading the files twice (which is still taking 25% of the
														
 
															+	time), the first pass will now collect newline information for the
														
 
															+	second pass in a buffer called nl_buff[].  This buffer, and the
														
 
															+	original token buffer now named TokenArray[], are managed by the file
														
 
															+	buff.c, which implements a layer between stream.h and pass?.c. This
														
 
															+	layer provides OpenText(), NextTextToken() and CloseText(), each
														
 
															+	with a parameter telling which pass it is.
														
 
															+
														
 
															+06-Dec-1988	Dick Grune (dick) at dick
														
 
															+	As an introduction to removing the second pass altogether, the
														
 
															+	first and second scan were unified, i.e., their input is identical.
														
 
															+	This also means that the call sim -[12] has now been replaced by
														
 
															+	one call:  sim --.
														
 
															+
														
 
															+23-Sep-1988	Dick Grune (dick) at dick
														
 
															+	Dynamic allocation of line buffers in pass 3.  This removes the
														
 
															+	restriction on the page width.
														
 
															+
														
 
															+22-Sep-1988	Dick Grune (dick) at dick
														
 
															+	In order to give better messages on incorrect calls to sim, the
														
 
															+	whole option handling has been concentrated in a file option.c and
														
 
															+	separated from the options and their messages themselves. See sim.c
														
 
															+
														
 
															+07-Sep-1988	Dick Grune (dick) at dick
														
 
															+	For long text sequences (say hundreds of thousands of tokens),
														
 
															+	the hashing is not really efficient any more since too many
														
 
															+	spurious matches occur.  Therefore, the forward reference table is
														
 
															+	scanned a second time, eliminating from any chain all references to
														
 
															+	runs that do not end in the same token.  For the UNIX manuals this
														
 
															+	reduced the number of matches from 91.9% to 1.9% (of which 0.06%
														
 
															+	were genuine).
														
 
															+
														
 
															+30-Aug-1988	Dick Grune (dick) at dick
														
 
															+	For compatibility, NextTop has been rewritten to yield true or
														
 
															+	false and to accept a pointer to a run as a parameter.
														
 
															+
														
 
															+30-Aug-1988	Dick Grune (dick) at dick
														
 
															+	When trying to find line-number and lseek position to beginnings
														
 
															+	and ends of runs found, the whole set of runs was scanned for each
														
 
															+	line in each file.  Now only the runs belonging to that file are
														
 
															+	scanned; to this end another linked list has been braided through
														
 
															+	the data structures (tx_chunk).
														
 
															+
														
 
															+30-Aug-1988	Dick Grune (dick) at dick
														
 
															+	The longest-common-substring algorithm was called much too often,
														
 
															+	mainly because the forward references made by hashing suffered from
														
 
															+	pollution.  If you have say 1000 tokens and a hash range of say
														
 
															+	10000, about 5 % of the hashings will be false matches, i.e. 50
														
 
															+	matches, which is quite a lot on a natural number of 2 to 3 matches.
														
 
															+	Improved by doing a second check in make_forw_ref().
														
 
															+
														
 
															+12-Jun-1988	Dick Grune (dick) at dick
														
 
															+	Installed a Lisp version supplied by Gertjan Akkerman.
														
 
															+
														
 
															+15-Jan-1988	Dick Grune (dick) at dick
														
 
															+	Added register declarations all over the place.
														
 
															+
														
 
															+14-Jan-1988	Dick Grune (dick) at dick
														
 
															+	It is often useful to match a piece of code exactly, especially
														
 
															+	when function names (or, even more so, macro names) are involved.
														
 
															+	What one would want is having all the letters in the text array,
														
 
															+	but this is kind of hard, since each entry is one lexical item.
														
 
															+	This means that under the -F option each letter is a lex item, and
														
 
															+	normally each tag is a lex item; this requires two lex grammars in
														
 
															+	one program; no good.  So, on the -F flag we hash the identifier
														
 
															+	into one lex item, which is hopefully characteristic enough.  It
														
 
															+	works.
														
 
															+
														
 
															+30-Sep-1987	Dick Grune (dick) at dick
														
 
															+	Some cosmetics.
														
 
															+
														
 
															+31-Aug-1987	Dick Grune (dick) at dick
														
 
															+	Moved the whole thing to the SUN (while testing on a VAX and a
														
 
															+	MC68000)
														
 
															+
														
 
															+16-Aug-1987	Dick Grune (dick) at dick
														
 
															+	The test program lang.c is no longer a main program, but rather a
														
 
															+	subroutine called in main() in sim.c, through the command line
														
 
															+	option -1 or -2.
														
 
															+
														
 
															+23-Apr-1987	Dick Grune (dick) at tjalk
														
 
															+	Changed the name 'index' into 'elem_of', because of compatibility
														
 
															+	problems on different Unices. Added a declaration for it in
														
 
															+	the file algollike.c
														
 
															+
														
 
															+10-Mar-1987	Dick Grune (dick) at tjalk
														
 
															+	Changed the printing of the header of a run so that:
														
 
															+	-	long file names will no longer be truncated
														
 
															+	-	the run length is displayed
														
 
															+
														
 
															+27-Jan-1987	Dick Grune (dick) at tjalk
														
 
															+	Switched it right off again!  Getting them in textual order is
														
 
															+	still more unpleasant, since now you cannot find the important
														
 
															+	ones if there are more than a few runs.
														
 
															+
														
 
															+27-Jan-1987	Dick Grune (dick) at tjalk
														
 
															+	Going to experiment with leaving out the sorting; just all the
														
 
															+	runs, in the order we meet them.  Should be as good or better.
														
 
															+	Comparisons of more than 100 runs are very rare anyway, so the
														
 
															+	fact that those over a 100 are rejected is probably no great
														
 
															+	help.  Getting them in a funny order is a nuisance, however.  Down
														
 
															+	with featurism.  Just to be safe, present version saved as
														
 
															+	870127.SV
														
 
															+
														
 
															+26-Dec-1986	Dick Grune (dick) at tjalk
														
 
															+	Names of overall parameters in params.h changed to more uniformity.
														
 
															+
														
 
															+26-Dec-1986	Dick Grune (dick) at tjalk
														
 
															+	Since the top package and the instantiation system have grown
														
 
															+	apart so much, I have integrated the old top package into sim,
														
 
															+	i.e., done the instantiation by hand.  This removes top.g and
														
 
															+	top.p, and will save outsiders from wondering what is going on
														
 
															+	here.
														
 
															+
														
 
															+23-Dec-1986	Dick Grune (dick) at tjalk
														
 
															+	Use setbuf to print unbuffered while reading the files (lex core
														
 
															+	dumps, other mishaps) and print buffered while printing the real
														
 
															+	output (for speed).
														
 
															+
														
 
															+30-Nov-1986	Dick Grune (dick) at tjalk
														
 
															+	Various small changes in *lang.l:
														
 
															+		; ignored conditionally (!options['f'])
														
 
															+		new format for tokens in struct idf
														
 
															+		cosmetics: macro Layout, macro UnsafeComChar, no \n
														
 
															+			in character denotations, more than one char
														
 
															+			in a char denotation in Pascal, etc.
														
 
															+
														
 
															+30-Nov-1986	Dick Grune (dick) at tjalk
														
 
															+	Added a Modula-2 version.
														
 
															+
														
 
															+29-Nov-1986	Dick Grune (dick) at tjalk
														
 
															+	Restricting tokens to the ASCII95 character set is really too
														
 
															+	severe: some languages have many more reserved words (COBOL!).
														
 
															+	Corrected this by adding a couple of '&0377' in strategic places.
														
 
															+	Added a routine for printing the 8-bit beasties: show_token().
														
 
															+
														
 
															+15-Aug-1986	Dick Grune (dick) at tjalk
														
 
															+	Since the ; is superfluous in both C and Pascal, it is now ignored
														
 
															+	by clang.l and pascallang.l
														
 
															+
														
 
															+15-Aug-1986	Dick Grune (dick) at tjalk
														
 
															+	The code in CheckRun in Xlang.l was incorrect in that it used the
														
 
															+	wrong criterion for throwing away trailing garbage. I've taken
														
 
															+	CheckRun etc. out of the Xlang.l-s and turned them into a module
														
 
															+	"algollike.c".  Made a cleaner interface and avoided duplication of
														
 
															+	code.
														
 
															+
														
 
															+02-Jul-1986	Dick Grune (dick) at tjalk
														
 
															+	Looking backwards in compare.c to see if we are in the middle of a
														
 
															+	run is an atavism. You can be and still be all right, e.g., if
														
 
															+	part of the run was rejected as not fitting for a function.
														
 
															+	Removed from compare.c.
														
 
															+
														
 
															+10-Jun-1986	Dick Grune (dick) at tjalk
														
 
															+	The function hash_code() in hash.c could yield a negative value;
														
 
															+	corrected.
														
 
															+
														
 
															+09-Jun-1986	Dick Grune (dick) at tjalk
														
 
															+	Changed the name of the file text.h to sim.h.  Sim.h is more
														
 
															+	appropriate and text.h sounds as if it belongs to text.l, with
														
 
															+	which it has no connection.
														
 
															+
														
 
															+04-Jun-1986	Dick Grune (dick) at tjalk
														
 
															+	After having looked at a couple of hash functions and having done
														
 
															+	some calculations on the number of duplicates normally encountered
														
 
															+	in hash functions, I conclude that our function in hash.c is quite
														
 
															+	good.  Removed all the statistics-gathering stuff.
														
 
															+	
														
 
															+	Actually, hash_table[] is not the hash table at all; it is a
														
 
															+	forward reference table; likewise, the real hash table was called
														
 
															+	last[].  Renamed both.
														
 
															+	
														
 
															+	There is a way to keep the hash table local without putting it on
														
 
															+	the stack: use malloc().
														
 
															+
														
 
															+02-Jun-1986	Dick Grune (dick) at tjalk
														
 
															+	Added a simple lex file for text: each word is condensed into a
														
 
															+	hash code which is mapped on the ASCII95 character set.  This
														
 
															+	turns out to be quite effective.
														
 
															+
														
 
															+01-Jun-1986	Dick Grune (dick) at tjalk
														
 
															+	The macros cput(tk) and c_eol() both have a return in them, so any
														
 
															+	code after them may not be executed -> they have to be last in an
														
 
															+	entry.  But they weren't, in many places; I can't imagine why it
														
 
															+	all worked nevertheless.  They have been renamed return_tk(tk) and
														
 
															+	return_eol() and the entries have been restructured.
														
 
															+
														
 
															+30-May-1986	Dick Grune (dick) at tjalk
														
 
															+	Moved the string and character entries in clang.l and pascallang.l
														
 
															+	to a place behind the comment entries, to avoid strings (and
														
 
															+	characters) being recognized inside comments.  I first thought
														
 
															+	this would not happen, but as Maarten pointed out, if both
														
 
															+	interpretations have the same length, lex will take the first
														
 
															+	entry. Now this will happen if the string occupies the whole line
														
 
															+	that would otherwise be taken as a comment.  In short,
														
 
															+	/*
														
 
															+	"hallo"
														
 
															+	*/
														
 
															+	would return ".
														
 
															+
														
 
															+28-May-1986	Dick Grune (dick) at tjalk
														
 
															+	Added -d option, to display the output in diff(1) format (courtesy
														
 
															+	of Maarten van der Meulen).
														
 
															+	Rewrote the lexical parsing of comments (likewise courtesy Maarten
														
 
															+	van der Meulen).
														
 
															+
														
 
															+20-May-1986	Dick Grune (dick) at tjalk
														
 
															+	Added a routine to convert identifiers to lower case in
														
 
															+	pascallang.l .
														
 
															+
														
 
															+19-May-1986	Dick Grune (dick) at tjalk
														
 
															+	Added -a option, to quickly check antecedent of a file (courtesy
														
 
															+	of Maarten van der Meulen).
														
 
															+
														
 
															+18-May-1986	Dick Grune (dick) at tjalk
														
 
															+	Brought everything under RCS/CVS.
														
 
															+
														
 
															+18-Mar-1986	Dick Grune (dick) at tjalk
														
 
															+	Added modifications by Paul Bame (hp-lsd!paul@hp-labs) to have an
														
 
															+	option -w to set the page width.
														
 
															+
														
 
															+21-Feb-1986	Dick Grune (dick) at tjalk
														
 
															+	Took array last[N_HASH] out of make_hash() in hash.c, due to stack
														
 
															+	overflow on the Gould (reported by George Walker
														
 
															+	[email protected])
														
 
															+
														
 
															+16-Feb-1986	Dick Grune (dick) at tjalk
														
 
															+	Corrected some subtractions that caused unsigned ints to turn
														
 
															+	pseudo-negative. (Reported by jaap@mcvax)
														
 
															+
														
 
															+11-Jan-1986	Dick Grune (dick) at tjalk
														
 
															+	Touched up for distribution.
														
 
															+
														
 
															+10-Jan-1986	Dick Grune (dick) at tjalk
														
 
															+	Fill_line was not called for empty lines, which caused them to be
														
 
															+	printed as repetitions of the previous line.
														
 
															+
														
 
															+24-Dec-1985	Dick Grune (dick) at tjalk
														
 
															+	Reduced hash table to a single array of indices; it is used only
														
 
															+	in one place, which makes it very easy to make it (the hash table)
														
 
															+	optional.  General tune-up of everything.  This seems to be
														
 
															+	another stable "final" version.
														
 
															+
														
 
															+14-Dec-1985	Dick Grune (dick) at tjalk
														
 
															+	Some experiments with hash formulas:
														
 
															+	h = (h OP CST) + *p++ OP CST yields	right	wrong
														
 
															+		* 96		- 32		205	562
														
 
															+		* 96		- 2		205	560
														
 
															+		* 96				205	560
														
 
															+		* 97				205	559
														
 
															+		<< 0				 66	3128
														
 
															+		<< 1				203	555
														
 
															+		<< 2				205	536
														
 
															+		<< 7				203	540
														
 
															+	Conclusion: it doesn't matter, unless you do it wrong.
														
 
															+
														
 
															+01-Oct-1983	Dick Grune (dick) at vu44
														
 
															+	Oldest known files.
														
 
															+
														
 
															+#	This file is part of the software similarity tester SIM.
														
 
															+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+#	$Id: ChangeLog,v 2.12 2007/08/27 09:57:30 dick Exp $
														
 
															+#
														
--- a/utils/sim_pasc/LICENSE.txt
+++ b/utils/sim_pasc/LICENSE.txt
@@ -0,0 +1,31 @@
 
															+Copyright (c) 1986, 2007, Dick Grune, Vrije Universiteit, The Netherlands
														
 
															+All rights reserved.
														
 
															+
														
 
															+Redistribution and use in source and binary forms,
														
 
															+with or without modification, are permitted provided
														
 
															+that the following conditions are met:
														
 
															+
														
 
															+   * Redistributions of source code must retain the above copyright
														
 
															+     notice, this list of conditions and the following disclaimer.
														
 
															+
														
 
															+   * Redistributions in binary form must reproduce the above
														
 
															+     copyright notice, this list of conditions and the following
														
 
															+     disclaimer in the documentation and/or other materials provided
														
 
															+     with the distribution.
														
 
															+
														
 
															+   * Neither the name of the Vrije Universiteit nor the names of its
														
 
															+     contributors may be used to endorse or promote products derived
														
 
															+     from this software without specific prior written permission.
														
 
															+
														
 
															+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
														
 
															+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
														
 
															+NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
														
 
															+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
														
 
															+IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
														
 
															+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
														
 
															+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
														
 
															+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
														
 
															+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
														
 
															+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
														
 
															+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
														
 
															+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
														
--- a/utils/sim_pasc/Makefile
+++ b/utils/sim_pasc/Makefile
@@ -0,0 +1,566 @@
 
															+#	This file is part of the software similarity tester SIM.
														
 
															+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+#	$Id: Makefile,v 2.17 2007/08/27 09:57:31 dick Exp $
														
 
															+#
														
 
															+
														
 
															+
														
 
															+#	E N T R Y   P O I N T S
														
 
															+test_sim:
														
 
															+
														
 
															+help:
														
 
															+	@echo 'Entry points:'
														
 
															+	@echo 'test_sim:	compile sim_c and run a simple test (default)'
														
 
															+	@echo ''
														
 
															+	@echo 'all:		create all binaries'
														
 
															+	@echo 'sim_X(.exe):	create specific binary for language X'
														
 
															+	@echo 'install_all:	install all binaries'
														
 
															+	@echo 'install.sim_X:	install specific binary for language X'
														
 
															+	@echo 'where X is one of c, java, pasc, m2, lisp, mira, text'
														
 
															+	@echo ''
														
 
															+	@echo 'lint:		lint sim_c sources'
														
 
															+	@echo 'lint.all:	lint all sim sources'
														
 
															+	@echo 'simsim:		run sim on the sim sources'
														
 
															+	@echo ''
														
 
															+	@echo 'simsrc.shr:	create sources shar file'
														
 
															+	@echo 'simsrc.zip:	create sources zip file'
														
 
															+	@echo 'depend:		update dependencies in Makefile'
														
 
															+	@echo 'clean:		remove created files'
														
 
															+	@echo ''
														
 
															+	@echo 'sim_exes:	create DOS executables in MSDOS; set date; make clean'
														
 
															+	@echo 'simexe.zip:	create DOS executables package in UNIX'
														
 
															+	@echo 'VERSION=2.X install_ftp:	install in the FTP directory in UNIX'
														
 
															+
														
 
															+VERSION =	2_21
														
 
															+
														
 
															+#
														
 
															+# When you modify any of the following flags, do 'make clean'
														
 
															+#
														
 
															+
														
 
															+include		sysidf.mk
														
 
															+
														
 
															+
														
 
															+# Flags
														
 
															+OPTLEVEL =	-O4#			#
														
 
															+
														
 
															+CFLAGS =	$(SYSTEM) $(OPTLEVEL) $(TESTTOKEN)
														
 
															+LFLAGS =	#			# loader flags
														
 
															+
														
 
															+TESTTOKEN =	#-DTESTTOKEN#		# define to test the token type
														
 
															+
														
 
															+
														
 
															+#	T E S T   P A R A M E T E R S
														
 
															+
														
 
															+# Parameters for two simple test runs, sim.res and stream.res:
														
 
															+TEST_LANG =	c#			# to test sim_X for language X
														
 
															+TEST_OPT =	-f -r 20#		# options to sim_X
														
 
															+TEST_INP =	pass3.c#		# guinea pig input
														
 
															+
														
 
															+TEST_OPT =	-p#			# options to sim_X
														
 
															+TEST_INP =	*.l#			# guinea pig input
														
 
															+TEST_INP =	simple*#		# guinea pig input
														
 
															+
														
 
															+
														
 
															+#	I N T R O D U C T I O N
														
 
															+
														
 
															+#	Each module (set of programs that together perform some function)
														
 
															+#	has the following sets of files defined for it:
														
 
															+#		_FLS	all files of that module, for, e.g.,
														
 
															+#			sharring, inventory, etc.
														
 
															+#		_SRC	the source files, from which other files derive
														
 
															+#		_CFS	the C-files, from which the object files derive
														
 
															+#		_OBJ	object files
														
 
															+#		_GRB	garbage files produced by compiling the module
														
 
															+#
														
 
															+#	(This is a feeble attempt at software-engineering a Makefile.)
														
 
															+#
														
 
															+
														
 
															+test_sim:	sim.res stream.res	# two simple tests
														
 
															+
														
 
															+
														
 
															+#	B I N A R I E S
														
 
															+
														
 
															+BINARIES =	sim_c$(EXE) sim_java$(EXE) sim_pasc$(EXE) \
														
 
															+		sim_m2$(EXE) sim_lisp$(EXE) sim_mira$(EXE) \
														
 
															+		sim_text$(EXE)
														
 
															+
														
 
															+all:		$(BINARIES)
														
 
															+
														
 
															+
														
 
															+#	C O M P I L A T I O N   R U L E S
														
 
															+
														
 
															+.SUFFIXES:	.o
														
 
															+.c.o:
														
 
															+		$(CC) -c $(CFLAGS) $<
														
 
															+
														
 
															+
														
 
															+#	A U X I L I A R Y   M O D U L E S
														
 
															+
														
 
															+# Common modules:
														
 
															+COM_CFS =	token.c lex.c stream.c text.c tokenarray.c error.c
														
 
															+COM_OBJ =	token.o lex.o stream.o text.o tokenarray.o error.o
														
 
															+COM_SRC =	token.h lex.h stream.h text.h tokenarray.h error.h \
														
 
															+		lang.h language.h \
														
 
															+		sortlist.spc sortlist.bdy system.par $(COM_CFS)
														
 
															+COM_FLS =	$(COM_SRC)
														
 
															+
														
 
															+# The idf module:
														
 
															+IDF_CFS =	idf.c
														
 
															+IDF_OBJ =	idf.o
														
 
															+IDF_SRC =	idf.h $(IDF_CFS)
														
 
															+IDF_FLS =	$(IDF_SRC)
														
 
															+
														
 
															+# The runs package:
														
 
															+RUNS_CFS =	runs.c percentages.c
														
 
															+RUNS_OBJ =	runs.o percentages.o
														
 
															+RUNS_SRC =	runs.h percentages.h $(RUNS_CFS)
														
 
															+RUNS_FLS =	$(RUNS_SRC) aiso.spc aiso.bdy
														
 
															+
														
 
															+# The main program:
														
 
															+MAIN_CFS =	sim.c options.c pass1.c hash.c compare.c add_run.c \
														
 
															+		pass2.c pass3.c
														
 
															+MAIN_OBJ =	sim.o options.o pass1.o hash.o compare.o add_run.o \
														
 
															+		pass2.o pass3.o
														
 
															+MAIN_SRC =	sim.h options.h pass1.h hash.h compare.h add_run.h \
														
 
															+		pass2.h pass3.h \
														
 
															+		debug.par settings.par $(MAIN_CFS)
														
 
															+MAIN_FLS =	$(MAIN_SRC)
														
 
															+
														
 
															+# The similarity tester without the language part:
														
 
															+SIM_CFS =	$(COM_CFS) $(IDF_CFS) $(RUNS_CFS) $(MAIN_CFS)
														
 
															+SIM_OBJ =	$(COM_OBJ) $(IDF_OBJ) $(RUNS_OBJ) $(MAIN_OBJ)
														
 
															+SIM_SRC =	$(COM_SRC) $(IDF_SRC) $(RUNS_SRC) $(MAIN_SRC)
														
 
															+SIM_FLS =	$(COM_FLS) $(IDF_FLS) $(RUNS_FLS) $(MAIN_FLS)
														
 
															+
														
 
															+
														
 
															+#	L A N G U A G E S
														
 
															+
														
 
															+# The algollike module:
														
 
															+ALG_CFS =	algollike.c
														
 
															+ALG_OBJ =	algollike.o
														
 
															+ALG_SRC =	algollike.h $(ALG_CFS)
														
 
															+ALG_FLS =	$(ALG_SRC)
														
 
															+
														
 
															+# The C Language module:					C
														
 
															+CLANG_CFS =	clang.c
														
 
															+CLANG_OBJ =	clang.o
														
 
															+CLANG_SRC =	clang.l
														
 
															+CLANG_FLS =	$(CLANG_SRC)
														
 
															+
														
 
															+clang.c:	clang.l
														
 
															+		$(LEX) -t clang.l >$@
														
 
															+
														
 
															+SIM_C_CFS =	$(SIM_CFS) $(ALG_CFS) $(CLANG_CFS)
														
 
															+SIM_C_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(CLANG_OBJ)
														
 
															+
														
 
															+sim_c$(EXE):	$(SIM_C_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_C_OBJ) -o $@
														
 
															+
														
 
															+SIM_C_GRB =	clang.c sim_c
														
 
															+
														
 
															+install.sim_c:	$(BINDIR)/sim_c$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_c$(EXE):	sim_c$(EXE)
														
 
															+		$(COPY) sim_c$(EXE) $@
														
 
															+
														
 
															+# The Java Language module:					Java
														
 
															+JAVALANG_CFS =	javalang.c
														
 
															+JAVALANG_OBJ =	javalang.o
														
 
															+JAVALANG_SRC =	javalang.l
														
 
															+JAVALANG_FLS =	$(JAVALANG_SRC)
														
 
															+
														
 
															+javalang.c:	javalang.l
														
 
															+		$(LEX) -t javalang.l >$@
														
 
															+
														
 
															+SIM_JAVA_CFS =	$(SIM_CFS) $(ALG_CFS) $(JAVALANG_CFS)
														
 
															+SIM_JAVA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(JAVALANG_OBJ)
														
 
															+
														
 
															+sim_java$(EXE):	$(SIM_JAVA_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_JAVA_OBJ) -o $@
														
 
															+
														
 
															+SIM_JAVA_GRB =	javalang.c sim_java
														
 
															+
														
 
															+install.sim_java:	$(BINDIR)/sim_java$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_java$(EXE):	sim_java$(EXE)
														
 
															+		$(COPY) sim_java$(EXE) $@
														
 
															+
														
 
															+# The Pascal Language module:					Pascal
														
 
															+PASCLANG_CFS =	pascallang.c
														
 
															+PASCLANG_OBJ =	pascallang.o
														
 
															+PASCLANG_SRC =	pascallang.l
														
 
															+PASCLANG_FLS =	$(PASCLANG_SRC)
														
 
															+
														
 
															+pascallang.c:	pascallang.l
														
 
															+		$(LEX) -t pascallang.l >pascallang.c
														
 
															+
														
 
															+SIM_PASC_CFS =	$(SIM_CFS) $(ALG_CFS) $(PASCLANG_CFS)
														
 
															+SIM_PASC_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(PASCLANG_OBJ)
														
 
															+
														
 
															+sim_pasc$(EXE):	$(SIM_PASC_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_PASC_OBJ) -o $@
														
 
															+
														
 
															+SIM_PASC_GRB =	pascallang.c sim_pasc
														
 
															+
														
 
															+install.sim_pasc:	$(BINDIR)/sim_pasc$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_pasc$(EXE):	sim_pasc$(EXE)
														
 
															+		$(COPY) sim_pasc$(EXE) $@
														
 
															+
														
 
															+# The Modula-2 Language module:					Modula-2
														
 
															+M2LANG_CFS =	m2lang.c
														
 
															+M2LANG_OBJ =	m2lang.o
														
 
															+M2LANG_SRC =	m2lang.l
														
 
															+M2LANG_FLS =	$(M2LANG_SRC)
														
 
															+
														
 
															+m2lang.c:	m2lang.l
														
 
															+		$(LEX) -t m2lang.l >$@
														
 
															+
														
 
															+SIM_M2_CFS =	$(SIM_CFS) $(ALG_CFS) $(M2LANG_CFS)
														
 
															+SIM_M2_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(M2LANG_OBJ)
														
 
															+
														
 
															+sim_m2$(EXE):	$(SIM_M2_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_M2_OBJ) -o $@
														
 
															+
														
 
															+SIM_M2_GRB =	m2lang.c sim_m2
														
 
															+
														
 
															+install.sim_m2:	$(BINDIR)/sim_m2$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_m2$(EXE):	sim_m2$(EXE)
														
 
															+		$(COPY) sim_m2$(EXE) $@
														
 
															+
														
 
															+# The Lisp Language module:					Lisp
														
 
															+LISPLANG_CFS =	lisplang.c
														
 
															+LISPLANG_OBJ =	lisplang.o
														
 
															+LISPLANG_SRC =	lisplang.l
														
 
															+LISPLANG_FLS =	$(LISPLANG_SRC)
														
 
															+
														
 
															+lisplang.c:	lisplang.l
														
 
															+		$(LEX) -t lisplang.l >$@
														
 
															+
														
 
															+SIM_LISP_CFS =	$(SIM_CFS) $(ALG_CFS) $(LISPLANG_CFS)
														
 
															+SIM_LISP_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(LISPLANG_OBJ)
														
 
															+
														
 
															+sim_lisp$(EXE):	$(SIM_LISP_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_LISP_OBJ) -o $@
														
 
															+
														
 
															+SIM_LISP_GRB =	lisplang.c sim_lisp
														
 
															+
														
 
															+install.sim_lisp:	$(BINDIR)/sim_lisp$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_lisp$(EXE):	sim_lisp$(EXE)
														
 
															+		$(COPY) sim_lisp$(EXE) $@
														
 
															+
														
 
															+# The Miranda Language module:					Miranda
														
 
															+MIRALANG_CFS =	miralang.c
														
 
															+MIRALANG_OBJ =	miralang.o
														
 
															+MIRALANG_SRC =	miralang.l
														
 
															+MIRALANG_FLS =	$(MIRALANG_SRC)
														
 
															+
														
 
															+miralang.c:	miralang.l
														
 
															+		$(LEX) -t miralang.l >$@
														
 
															+
														
 
															+SIM_MIRA_CFS =	$(SIM_CFS) $(ALG_CFS) $(MIRALANG_CFS)
														
 
															+SIM_MIRA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(MIRALANG_OBJ)
														
 
															+
														
 
															+sim_mira$(EXE):	$(SIM_MIRA_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_MIRA_OBJ) -o $@
														
 
															+
														
 
															+SIM_MIRA_GRB =	miralang.c sim_mira
														
 
															+
														
 
															+install.sim_mira:	$(BINDIR)/sim_mira$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_mira$(EXE):	sim_mira$(EXE)
														
 
															+		$(COPY) sim_mira$(EXE) $@
														
 
															+
														
 
															+# The Text module:						Text
														
 
															+TEXTLANG_CFS =	textlang.c
														
 
															+TEXTLANG_OBJ =	textlang.o
														
 
															+TEXTLANG_SRC =	textlang.l
														
 
															+TEXTLANG_FLS =	$(TEXTLANG_SRC)
														
 
															+
														
 
															+textlang.c:	textlang.l
														
 
															+		$(LEX) -t textlang.l >$@
														
 
															+
														
 
															+SIM_TEXT_CFS =	$(SIM_CFS) $(TEXTLANG_CFS)
														
 
															+SIM_TEXT_OBJ =	$(SIM_OBJ) $(TEXTLANG_OBJ)
														
 
															+
														
 
															+sim_text$(EXE):	$(SIM_TEXT_OBJ)
														
 
															+		$(CC) $(LFLAGS) $(SIM_TEXT_OBJ) -o $@
														
 
															+
														
 
															+SIM_TEXT_GRB =	textlang.c sim_text
														
 
															+
														
 
															+install.sim_text:	$(BINDIR)/sim_text$(EXE) $(MANDIR)/sim.1
														
 
															+
														
 
															+$(BINDIR)/sim_text$(EXE):	sim_text$(EXE)
														
 
															+		$(COPY) sim_text$(EXE) $@
														
 
															+
														
 
															+
														
 
															+#	T E S T S
														
 
															+
														
 
															+# Some simple tests:
														
 
															+sim.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
														
 
															+		./sim_$(TEST_LANG)$(EXE) $(TEST_OPT) $(TEST_INP)
														
 
															+#		./sim_$(TEST_LANG)$(EXE) -x $(TEST_OPT) $(TEST_INP)
														
 
															+
														
 
															+stream.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
														
 
															+		./sim_$(TEST_LANG)$(EXE) -- $(TEST_INP) >stream.res
														
 
															+		wc stream.res $(TEST_INP)
														
 
															+
														
 
															+TEST_GRB =	stream.res
														
 
															+
														
 
															+# More simple tests, on the C version only:
														
 
															+simsim:		sim_c$(EXE) $(SRC)
														
 
															+		./sim_c$(EXE) -fr 20 $(SRC)
														
 
															+
														
 
															+# Lint
														
 
															+lint:		$(SIM_C_CFS)
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
														
 
															+
														
 
															+lint.all:	$(SIM_C_CFS) $(SIM_JAVA_CFS) $(SIM_PASC_CFS) $(SIM_M2_CFS) \
														
 
															+		$(SIM_LISP_CFS) $(SIM_MIRA_CFS) $(SIM_TEXT_CFS)
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_JAVA_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_PASC_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_M2_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_LISP_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_MIRA_CFS) | grep -v yy
														
 
															+		$(LINT) $(LINTFLAGS) $(SIM_TEXT_CFS) | grep -v yy
														
 
															+
														
 
															+
														
 
															+#	O T H E R   E N T R I E S
														
 
															+
														
 
															+# Sets of files: general, modules, main programs, languages
														
 
															+CFS =		$(SIM_CFS) $(ALG_CFS) \
														
 
															+		$(CLANG_CFS) $(JAVALANG_CFS) $(PASCLANG_CFS) $(M2LANG_CFS) \
														
 
															+		$(LISPLANG_CFS) $(MIRALANG_CFS) $(TEXTLANG_CFS)
														
 
															+OBJ =		$(SIM_OBJ) $(ALG_OBJ) \
														
 
															+		$(CLANG_OBJ) $(JAVALANG_OBJ) $(PASCLANG_OBJ) $(M2LANG_OBJ) \
														
 
															+		$(LISPLANG_OBJ) $(MIRALANG_OBJ) $(TEXTLANG_OBJ)
														
 
															+SRC =		$(SIM_SRC) $(ALG_SRC) \
														
 
															+		$(CLANG_SRC) $(JAVALANG_SRC) $(PASCLANG_SRC) $(M2LANG_SRC) \
														
 
															+		$(LISPLANG_SRC) $(MIRALANG_SRC) $(TEXTLANG_SRC)
														
 
															+FLS =		$(SIM_FLS) $(ALG_FLS) \
														
 
															+		$(CLANG_FLS) $(JAVALANG_FLS) $(PASCLANG_FLS) $(M2LANG_FLS) \
														
 
															+		$(LISPLANG_FLS) $(MIRALANG_FLS) $(TEXTLANG_FLS) \
														
 
															+		sysidf.mk sysidf.msdos sysidf.unix
														
 
															+DOC =		READ_ME READ.ME README.1st sim.1 sim.txt sim.html \
														
 
															+		ChangeLog Answers TechnReport
														
 
															+
														
 
															+ALL_FLS =	Makefile $(FLS) $(DOC)
														
 
															+
														
 
															+# Create .EXE archive for MSDOS
														
 
															+SIM_EXES =	sim_c.exe sim_java.exe sim_pasc.exe sim_m2.exe \
														
 
															+		sim_lisp.exe sim_mira.exe sim_text.exe
														
 
															+DOSZIP =	READ.ME sim.txt $(SIM_EXES)
														
 
															+sim_exes:	$(SIM_EXES)
														
 
															+
														
 
															+simexe.zip:	$(DOSZIP)
														
 
															+		$(ZIP) $@ $(DOSZIP)
														
 
															+
														
 
															+DOS_GRB =	simexe.zip
														
 
															+
														
 
															+# Install and clean scripts
														
 
															+install_all:	install			# just a synonym
														
 
															+install:	install.sim_c install.sim_java install.sim_pasc \
														
 
															+		install.sim_m2 install.sim_lisp install.sim_mira \
														
 
															+		install.sim_text
														
 
															+
														
 
															+$(MANDIR)/sim.1:	sim.1
														
 
															+		$(COPY) sim.1 $@
														
 
															+
														
 
															+FTPFILES =	README.1st READ_ME LICENSE.txt TechnReport
														
 
															+
														
 
															+install_ftp:	$(FTPFILES) simsrc.shr simexe.zip sim.pdf
														
 
															+		cp -p simsrc.shr sim_$(VERSION).shar
														
 
															+		cp -p simexe.zip sim_$(VERSION).zip
														
 
															+		cp -p $(FTPFILES) sim_$(VERSION).shar sim_$(VERSION).zip \
														
 
															+			README.1st READ.ME READ_ME sim.pdf \
														
 
															+			$(FTPDIR)/.
														
 
															+		rm -f sim_$(VERSION).shar sim_$(VERSION).zip
														
 
															+		ls -l $(FTPDIR)/.
														
 
															+
														
 
															+simsrc.shr:	$(ALL_FLS)
														
 
															+		shar $(ALL_FLS) >$@
														
 
															+
														
 
															+simsrc.zip:	$(ALL_FLS)
														
 
															+		$(ZIP) $@ $(ALL_FLS)
														
 
															+
														
 
															+sim.txt:	sim.1
														
 
															+		nroff -man sim.1 | sed "s/.$$(printf '\b')//g" >$@
														
 
															+
														
 
															+sim.pdf:	sim.1
														
 
															+		troff -man sim.1 | devps | ps2pdf -sPAPERSIZE=a4 - $@
														
 
															+
														
 
															+INSTALL_GRB =	simsrc.shr simsrc.zip sim.txt sim.pdf
														
 
															+
														
 
															+depend:		$(CFS)
														
 
															+		makedepend -w 1 -Dlint $(CFS)
														
 
															+
														
 
															+.PHONY:		clean fresh
														
 
															+clean:
														
 
															+		-rm -f *.o
														
 
															+		-rm -f $(SIM_C_GRB)
														
 
															+		-rm -f $(SIM_JAVA_GRB)
														
 
															+		-rm -f $(SIM_PASC_GRB)
														
 
															+		-rm -f $(SIM_M2_GRB)
														
 
															+		-rm -f $(SIM_LISP_GRB)
														
 
															+		-rm -f $(SIM_MIRA_GRB)
														
 
															+		-rm -f $(SIM_TEXT_GRB)
														
 
															+		-rm -f $(TEST_GRB)
														
 
															+		-rm -f $(INSTALL_GRB)
														
 
															+		-rm -f a.out a.exe sim.txt core mon.out
														
 
															+
														
 
															+fresh:		clean
														
 
															+		-rm -f $(DOS_GRB)
														
 
															+		-rm -f *.exe
														
 
															+
														
 
															+#	D E P E N D E N C I E S
														
 
															+
														
 
															+# DO NOT DELETE THIS LINE -- make depend depends on it.
														
 
															+
														
 
															+token.o: token.h
														
 
															+lex.o: token.h
														
 
															+lex.o: lex.h
														
 
															+stream.o: system.par
														
 
															+stream.o: token.h
														
 
															+stream.o: lex.h
														
 
															+stream.o: lang.h
														
 
															+stream.o: stream.h
														
 
															+text.o: debug.par
														
 
															+text.o: sim.h
														
 
															+text.o: token.h
														
 
															+text.o: stream.h
														
 
															+text.o: lex.h
														
 
															+text.o: options.h
														
 
															+text.o: error.h
														
 
															+text.o: text.h
														
 
															+tokenarray.o: error.h
														
 
															+tokenarray.o: lex.h
														
 
															+tokenarray.o: token.h
														
 
															+tokenarray.o: tokenarray.h
														
 
															+error.o: sim.h
														
 
															+error.o: error.h
														
 
															+idf.o: system.par
														
 
															+idf.o: token.h
														
 
															+idf.o: idf.h
														
 
															+runs.o: sim.h
														
 
															+runs.o: runs.h
														
 
															+runs.o: aiso.spc
														
 
															+runs.o: aiso.bdy
														
 
															+percentages.o: sim.h
														
 
															+percentages.o: runs.h
														
 
															+percentages.o: aiso.spc
														
 
															+percentages.o: error.h
														
 
															+percentages.o: percentages.h
														
 
															+percentages.o: sortlist.bdy
														
 
															+sim.o: settings.par
														
 
															+sim.o: sim.h
														
 
															+sim.o: options.h
														
 
															+sim.o: language.h
														
 
															+sim.o: token.h
														
 
															+sim.o: error.h
														
 
															+sim.o: hash.h
														
 
															+sim.o: compare.h
														
 
															+sim.o: pass1.h
														
 
															+sim.o: pass2.h
														
 
															+sim.o: pass3.h
														
 
															+sim.o: stream.h
														
 
															+sim.o: lex.h
														
 
															+options.o: options.h
														
 
															+pass1.o: debug.par
														
 
															+pass1.o: sim.h
														
 
															+pass1.o: text.h
														
 
															+pass1.o: tokenarray.h
														
 
															+pass1.o: token.h
														
 
															+pass1.o: lex.h
														
 
															+pass1.o: error.h
														
 
															+pass1.o: pass1.h
														
 
															+hash.o: system.par
														
 
															+hash.o: debug.par
														
 
															+hash.o: sim.h
														
 
															+hash.o: error.h
														
 
															+hash.o: language.h
														
 
															+hash.o: token.h
														
 
															+hash.o: tokenarray.h
														
 
															+hash.o: options.h
														
 
															+hash.o: hash.h
														
 
															+compare.o: sim.h
														
 
															+compare.o: tokenarray.h
														
 
															+compare.o: token.h
														
 
															+compare.o: hash.h
														
 
															+compare.o: language.h
														
 
															+compare.o: options.h
														
 
															+compare.o: add_run.h
														
 
															+compare.o: compare.h
														
 
															+add_run.o: sim.h
														
 
															+add_run.o: runs.h
														
 
															+add_run.o: aiso.spc
														
 
															+add_run.o: percentages.h
														
 
															+add_run.o: options.h
														
 
															+add_run.o: error.h
														
 
															+add_run.o: add_run.h
														
 
															+pass2.o: debug.par
														
 
															+pass2.o: sim.h
														
 
															+pass2.o: text.h
														
 
															+pass2.o: lex.h
														
 
															+pass2.o: token.h
														
 
															+pass2.o: pass2.h
														
 
															+pass2.o: sortlist.bdy
														
 
															+pass3.o: system.par
														
 
															+pass3.o: debug.par
														
 
															+pass3.o: sim.h
														
 
															+pass3.o: runs.h
														
 
															+pass3.o: aiso.spc
														
 
															+pass3.o: error.h
														
 
															+pass3.o: options.h
														
 
															+pass3.o: pass3.h
														
 
															+pass3.o: percentages.h
														
 
															+pass3.o: tokenarray.h
														
 
															+pass3.o: token.h
														
 
															+algollike.o: options.h
														
 
															+algollike.o: token.h
														
 
															+algollike.o: algollike.h
														
 
															+algollike.o: language.h
														
 
															+clang.o: options.h
														
 
															+clang.o: algollike.h
														
 
															+clang.o: language.h
														
 
															+clang.o: token.h
														
 
															+clang.o: idf.h
														
 
															+clang.o: lex.h
														
 
															+clang.o: lang.h
														
 
															+javalang.o: options.h
														
 
															+javalang.o: algollike.h
														
 
															+javalang.o: language.h
														
 
															+javalang.o: token.h
														
 
															+javalang.o: idf.h
														
 
															+javalang.o: lex.h
														
 
															+javalang.o: lang.h
														
 
															+pascallang.o: options.h
														
 
															+pascallang.o: algollike.h
														
 
															+pascallang.o: language.h
														
 
															+pascallang.o: token.h
														
 
															+pascallang.o: idf.h
														
 
															+pascallang.o: lex.h
														
 
															+pascallang.o: lang.h
														
 
															+m2lang.o: options.h
														
 
															+m2lang.o: algollike.h
														
 
															+m2lang.o: language.h
														
 
															+m2lang.o: token.h
														
 
															+m2lang.o: idf.h
														
 
															+m2lang.o: lex.h
														
 
															+m2lang.o: lang.h
														
 
															+lisplang.o: language.h
														
 
															+lisplang.o: token.h
														
 
															+lisplang.o: lex.h
														
 
															+lisplang.o: lang.h
														
 
															+lisplang.o: idf.h
														
 
															+miralang.o: language.h
														
 
															+miralang.o: token.h
														
 
															+miralang.o: lex.h
														
 
															+miralang.o: lang.h
														
 
															+miralang.o: idf.h
														
 
															+textlang.o: language.h
														
 
															+textlang.o: token.h
														
 
															+textlang.o: idf.h
														
 
															+textlang.o: lex.h
														
 
															+textlang.o: lang.h
														
--- a/utils/sim_pasc/READ.ME
+++ b/utils/sim_pasc/READ.ME
@@ -0,0 +1,34 @@
 
															+#	This file is part of the software similarity tester SIM.
														
 
															+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+#	$Id: READ.ME,v 2.8 2005/02/20 17:02:58 dick Exp $
														
 
															+
														
 
															+These programs test for similar (or equal) stretches in one or more program
														
 
															+files and can be used to detect common code or plagiarism. See SIM.TXT.
														
 
															+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
														
 
															+natural text.
														
 
															+
														
 
															+This READ.ME file describes the MSDOS version. The UNIX version is described
														
 
															+in the file READ_ME.
														
 
															+
														
 
															+The archive SIM_2_21.ZIP contains:
														
 
															+	READ.ME			this READ.ME file
														
 
															+	SIM.TXT			a 2-page manual, UNIX-style
														
 
															+	SIM_C.EXE		similarity tester for C
														
 
															+	SIM_JAVA.EXE		similarity tester for Java
														
 
															+	SIM_PASC.EXE		similarity tester for Pascal
														
 
															+	SIM_M2.EXE		similarity tester for Modula-2
														
 
															+	SIM_LISP.EXE		similarity tester for Lisp
														
 
															+	SIM_MIRA.EXE		similarity tester for Miranda
														
 
															+	SIM_TEXT.EXE		similarity tester for text
														
 
															+
														
 
															+The MSDOS version does not contain sources. The sources are available from
														
 
															+the UNIX archive sim_2_21.shar, but require a C compiler, flex and make.
														
 
															+
														
 
															+					Dick Grune
														
 
															+					Vrije Universiteit
														
 
															+					de Boelelaan 1081
														
 
															+					1081 HV  Amsterdam
														
 
															+					the Netherlands
														
 
															+					email: dick@cs.vu.nl
														
 
															+					ftp://ftp.cs.vu.nl/pub/dick
														
 
															+					http://www.cs.vu.nl/~dick
														
--- a/utils/sim_pasc/README.1st
+++ b/utils/sim_pasc/README.1st
@@ -0,0 +1,68 @@
 
															+This is SIM, Software and text similarity tester, most recent revision
														
 
															+                                                               (2.19, 20050220)
														
 
															+by Dick Grune, Vrije Universiteit, Amsterdam, the Netherlands (dick@cs.vu.nl).
														
 
															+
														
 
															+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
														
 
															+Miranda and natural language. It can be used
														
 
															+
														
 
															+- to detect potentially duplicated code fragments in large software projects,
														
 
															+- to detect plagiarism in software and text-based projects, educational and
														
 
															+  otherwise.
														
 
															+
														
 
															+The program is fast:
														
 
															+the UNIX version on a Sun ULTRA does about 50000 tokens/sec,
														
 
															+the DOS version on a Pentium 166 does about 25000 tokens/sec.
														
 
															+
														
 
															+SIM is available for UNIX (in source code) and MSDOS (32-bit executables).
														
 
															+
														
 
															+UNIX:
														
 
															+	To obtain the files, do:
														
 
															+		sh sim_2_21.shar
														
 
															+	This unpacks the sources, the Makefile, sim.1 and READ_ME.
														
 
															+	For installation notes and other info then see READ_ME.
														
 
															+
														
 
															+MSDOS:
														
 
															+	To obtain the files, do:
														
 
															+		[pk]unzip SIM_2_21.zip
														
 
															+	This unpacks the executables, SIM.TXT and READ.ME.
														
 
															+	For other info then see READ.ME.
														
 
															+
														
 
															+Changes from Release 2.19:
														
 
															+	Various changes necessitated by Linux flex being different
														
 
															+
														
 
															+Changes from Release 2.16:
														
 
															+	Various updates and adjustments in the code and the installation
														
 
															+	procedure.
														
 
															+
														
 
															+Changes from Release 2.13:
														
 
															+	Percentage reporting feature added.
														
 
															+
														
 
															+Changes from Release 2.12:
														
 
															+	Miranda checker added.
														
 
															+
														
 
															+Changes from Release 2.9:
														
 
															+	Java checker added.
														
 
															+	The C checker 'sim' was renamed to 'sim_c', for uniformity.
														
 
															+	Converted the sources to ANSI C.
														
 
															+	All versions now report non-ASCII characters in the input.
														
 
															+
														
 
															+Changes from Release 2.8:
														
 
															+	DOS versions can now compare very large files (>400000 tokens)
														
 
															+
														
 
															+Changes from Release 1.21, as posted in comp.sources.unix (1987):
														
 
															+	Ported to MSDOS
														
 
															+	Significant speed improvements
														
 
															+	New options: -e, -S and / , to compare files group-wise
														
 
															+	New option: -F , to require function names to match exactly
														
 
															+	Lisp version added
														
 
															+	Miscellaneous improvements
														
 
															+
														
 
															+
														
 
															+					Dick Grune
														
 
															+					Vrije Universiteit
														
 
															+					de Boelelaan 1081
														
 
															+					1081 HV  Amsterdam
														
 
															+					the Netherlands
														
 
															+					email: dick@cs.vu.nl
														
 
															+					ftp://ftp.cs.vu.nl/pub/dick
														
 
															+					http://www.cs.vu.nl/~dick
														
--- a/utils/sim_pasc/READ_ME
+++ b/utils/sim_pasc/READ_ME
@@ -0,0 +1,52 @@
 
															+#	This file is part of the software similarity tester SIM.
														
 
															+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+#	$Id: READ_ME,v 2.6 2005/02/20 17:02:59 dick Exp $
														
 
															+
														
 
															+These programs test for similar (or equal) stretches in one or more program
														
 
															+files and can be used to detect common code or plagiarism. See sim.1.
														
 
															+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
														
 
															+natural text.
														
 
															+
														
 
															+This READ_ME file describes the UNIX version. The MSDOS version is described
														
 
															+in the file READ.ME.
														
 
															+
														
 
															+To obtain the sources, do
														
 
															+	sh sim_2_21.shar
														
 
															+
														
 
															+To compile and test, just call
														
 
															+	make
														
 
															+This will generate one executable called sim_c, the checker for C, and will
														
 
															+run two small tests to show sample output.
														
 
															+
														
 
															+To install, examine sysidf.mk, reset BINDIR and MANDIR to sensible paths,
														
 
															+and call
														
 
															+	make install.sim_c			for C
														
 
															+	make install.sim_java			for Java
														
 
															+	make install.sim_pasc			for Pascal
														
 
															+	make install.sim_m2			for Modula-2
														
 
															+	make install.sim_lisp			for Lisp
														
 
															+	make install.sim_mira			for Miranda
														
 
															+	make install.sim_text			for text
														
 
															+or
														
 
															+	make install.all			for everything.
														
 
															+These will also install the manual page.
														
 
															+
														
 
															+To change the default run size or the page width, adjust the file params.h
														
 
															+and recompile.
														
 
															+
														
 
															+To add another language L, write a file Llang.l along the lines of clang.l
														
 
															+and the other *lang.l files, extend the Makefile and recompile.
														
 
															+All knowledge about a given language L is located in Llang.l; the rest of
														
 
															+the programs expect each token to be a single character.
														
 
															+
														
 
															+Available at present:
														
 
															+	clang.l javalang.l pascallang.l m2lang.l lisplang.l miralang.l text.l
														
 
															+
														
 
															+					Dick Grune
														
 
															+					Vrije Universiteit
														
 
															+					de Boelelaan 1081
														
 
															+					1081 HV  Amsterdam
														
 
															+					the Netherlands
														
 
															+					email: dick@cs.vu.nl
														
 
															+					ftp://ftp.cs.vu.nl/pub/dick
														
 
															+					http://www.cs.vu.nl/~dick
														
--- a/utils/sim_pasc/TechnReport
+++ b/utils/sim_pasc/TechnReport
@@ -0,0 +1,214 @@
 
															+		CONCISE REPORT ON THE ALGORITHMS IN SIM			970623
														
 
															+
														
 
															+
														
 
															+
														
 
															+	INTRODUCTION
														
 
															+
														
 
															+The general outline of the similarity checker is as follows:
														
 
															+
														
 
															+	1. the files are read in (pass 1)
														
 
															+	2. a forward-reference table is prepared
														
 
															+	3. the set of interesting runs is determined
														
 
															+	4. the line numbers of the runs are determined (pass 2)
														
 
															+	5. the contents of the runs are printed in order (pass 3)
														
 
															+
														
 
															+To keep the memory requirements (relatively) small, the exact positions
														
 
															+of the tokens are not recorded.  This necessitates pass 2.  See, however,
														
 
															+the pertinent chapter.
														
 
															+
														
 
															+
														
 
															+	READING THE FILES
														
 
															+
														
 
															+Each file is tokenized using a lex-generated scanner appropriate for
														
 
															+the input.  Each token fits in one byte, possibly using all 8 bits.  The
														
 
															+tokens are stored in the array TokenArray[], which is extended by
														
 
															+reallocation if it overflows.  See tokenarray.c.
														
 
															+
														
 
															+Also, to optimize away pass 2, an attempt is made to remember the token
														
 
															+positions of all beginnings of lines.  The token-positions at BOL are
														
 
															+stored in the array nl_buff[], which is also extended by reallocation,
														
 
															+if needed.  If the attempt fails due to lack of memory, nl_buff[] is
														
 
															+abandoned, and pass2 will read the files instead.
														
 
															+
														
 
															+
														
 
															+	PREPARING THE FORWARD-REFERENCE TABLE
														
 
															+
														
 
															+Text is compared by comparing every substring to all substrings
														
 
															+to the right of it; this process is in essence quadratic.  However,
														
 
															+only substrings of length at least 'MinRunSize' are of interest,
														
 
															+which gives us the possibility to speed up this process by using
														
 
															+a hash table.
														
 
															+
														
 
															+Once the entire text has been read in, a forward-reference table
														
 
															+forward_references[] is made (see hash.c).
														
 
															+For every position in the text, we construct an index which gives
														
 
															+the next position in the text where a run of MinRunSize tokens
														
 
															+starts that has the same hash code.  If there is no such run, the
														
 
															+index is 0.
														
 
															+
														
 
															+To fill in this array, we use a hash table last_index[], such that
														
 
															+last_index[i] is the index of the latest token with hash_code i, or 0 if
														
 
															+there is none.  If at a given position p, we find that the text ahead of
														
 
															+us has hash code i, last_index[i] tells us which position in
														
 
															+forward_references[] will have to be updated to p.
														
 
															+See MakeForwardReferences().
														
 
															+
														
 
															+For long text sequences (say hundreds of thousands of tokens), the
														
 
															+hashing is not really efficient any more since too many spurious matches
														
 
															+occur.  Therefore, the forward reference table is scanned a second time,
														
 
															+eliminating from any chain all references to runs that do not start with
														
 
															+and end in the same token (actually this is a second hash code).
														
 
															+For the UNIX manuals this reduced the number of matches from 91.9% to 1.9%
														
 
															+(of which 0.06% was genuine).
														
 
															+
														
 
															+	DETERMINING THE SET OF INTERESTING RUNS
														
 
															+
														
 
															+The overall structure of the routine Compare() (see compare.c) is:
														
 
															+
														
 
															+for all new files
														
 
															+	for all texts it must be compared to
														
 
															+		for all positions in the new file
														
 
															+			for all positions in the text
														
 
															+				for ever increasing sizes
														
 
															+					try to match and keep the best
														
 
															+
														
 
															+If for a given position in the new file a good run (i.e. one of at least
														
 
															+minimum length) has been found, the run is registered using a call of
														
 
															+add_run(), the run is skipped in the new file and searching continues at
														
 
															+the position after it.  This prevents duplicate reports of runs.
														
 
															+
														
 
															+Add_run() allocates a struct run for the run (see sim.h)
														
 
															+which contains two struct chunks and a quality description.  It fills
														
 
															+in the two chunks with the pertinent info, one for the first file and
														
 
															+one for the second (which may be the same, if the run relates two chunks
														
 
															+in the same file).
														
 
															+
														
 
															+The run is then entered into the arbitrary-in-sorted-out store AISO (see
														
 
															+aiso.spc and aiso.bdy, a genuine generic abstract data type in C!), in
														
 
															+which it is inserted according to its quality.  Both positions
														
 
															+(struct position) in both chunks in the run (so four in total) are each
														
 
															+entered in a linked list starting at the tx_pos field in the struct text
														
 
															+of the appropriate file.
														
 
															+
														
 
															+When this is finished, the forward reference table can be deleted.
														
 
															+
														
 
															+So the final results of this phase are visible both through the tx_pos
														
 
															+fields and through the aiso interface.
														
 
															+
														
 
															+
														
 
															+	DETERMINING THE EXACT POSITION OF EACH RUN (PASS 2)
														
 
															+
														
 
															+The purpose of this pass is to find for each chunk, which up to now is
														
 
															+known by token position only, its starting and ending line number (which
														
 
															+cannot be easily derived from the token position).
														
 
															+
														
 
															+For each file that has a non-zero tx_pos field, i.e. that has some
														
 
															+interesting chunks, the positions in the tx_pos list are sorted on
														
 
															+ascending line number (they have been found in essentially arbitrary
														
 
															+order) by sort_pos() in pass2.c.
														
 
															+
														
 
															+Next we scan the pos list and the file in parallel, updating the info in
														
 
															+a position when we meet it.  A position carries an indication whether it
														
 
															+is a starting or an ending position, since slightly differing
														
 
															+calculations have to be done in each case.
														
 
															+
														
 
															+Actually, if the nl_buff[] data structure still exists, the file is not
														
 
															+accessed at all and the data from nl_buff[] is used instead.  This is
														
 
															+done transparently in buff.c.
														
 
															+
														
 
															+
														
 
															+	PRINTING THE CONTENTS OF THE RUNS (PASS 3)
														
 
															+
														
 
															+Since each struct run has now been completely filled in, this is simple;
														
 
															+the hard work is calculating the page layout.
														
 
															+Pass3() accesses the aiso store and retrieves from it the runs in
														
 
															+descending order of importance.  Show_run() opens both files, positions
														
 
															+them using the line numbers and prints the runs.
														
 
															+
														
 
															+================================================================
														
 
															+	CODE EXCERPT OF THE SOFTWARE SIMILARITY TESTER SIM (980222)
														
 
															+
														
 
															+sim:
														
 
															+	get command line options
														
 
															+	check the options
														
 
															+
														
 
															+	init language, to precompute tables
														
 
															+
														
 
															+	pass1, read the files
														
 
															+		# there is an array TokenArray[] that holds all input tokens
														
 
															+
														
 
															+	make forward reference table
														
 
															+		# there is an array forward_references[], with one entry for
														
 
															+		#   each token in the input; forward_references[i] gives the
														
 
															+		#   token number where a token sequence starts with the same
														
 
															+		#   hash value as the one starting at i
														
 
															+
														
 
															+	compare various files to find runs
														
 
															+	delete forward reference table
														
 
															+	pass2, find newline positions of found similarities
														
 
															+	pass3, print the similarities
														
 
															+
														
 
															+
														
 
															+
														
 
															+pass1, read the files:
														
 
															+	for each file
														
 
															+		divide the text into tokens
														
 
															+		store all tokens except newlines in TokenArray and try to
														
 
															+			keep a record of the newline positions
														
 
															+
														
 
															+
														
 
															+
														
 
															+make forward reference table:
														
 
															+	# there are two independent hash functions, hash1() and hash2().
														
 
															+	#   hash1(i) gives the hash value of the token sequence starting at i
														
 
															+	#   likewise for hash2(i)
														
 
															+
														
 
															+	set up the forward references using the last_index table:
														
 
															+		# there is an array last_index[], with one entry for each
														
 
															+		#   possible hash value; last_index[i] gives the position in
														
 
															+		#   forward_references[] at which i was most recently
														
 
															+		#   encountered as a hash value
														
 
															+		for each file
														
 
															+			for all positions in file except the last MinRunSize
														
 
															+				set forward_references[] and update last_index[]
														
 
															+
														
 
															+	use hash2() to clean out matches:
														
 
															+		for all tokens
														
 
															+			find first token in chain with same hash2 code
														
 
															+			short-circuit forward reference to it
														
 
															+
														
 
															+
														
 
															+
														
 
															+compare:
														
 
															+	for all new files
														
 
															+		for all texts it must be compared to
														
 
															+			for all positions in the new file
														
 
															+				for all positions in the text
														
 
															+					for ever increasing sizes
														
 
															+						try to match and keep the best
														
 
															+	try to match and keep the best:
														
 
															+		# using forward_references[], we find a list of positions in
														
 
															+		#   which a matching token sequence will start;
														
 
															+		#   scanning this list, we measure the maximum length of the
														
 
															+		#   match and add the longest match to the run collection
														
 
															+
														
 
															+
														
 
															+
														
 
															+pass2, find positions of found runs:
														
 
															+	for all files:
														
 
															+		sort the positions in the runs
														
 
															+
														
 
															+		# we scan the pos list and the file in parallel
														
 
															+		for all positions inside this file
														
 
															+			if it matches a token position in a run
														
 
															+				record line number
														
 
															+
														
 
															+
														
 
															+
														
 
															+pass3, print the similarities:
														
 
															+	for all runs
														
 
															+		# a run consists of two chunks
														
 
															+		open the files that hold the chunks and position them
														
 
															+		  at the beginning of the chunk
														
 
															+		display the chunks
														
 
															+
														
--- a/utils/sim_pasc/add_run.c
+++ b/utils/sim_pasc/add_run.c
@@ -0,0 +1,70 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: add_run.c,v 2.5 2001/11/08 12:30:28 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"sim.h"
														
 
															+#include	"runs.h"
														
 
															+#include	"percentages.h"
														
 
															+#include	"options.h"
														
 
															+#include	"error.h"
														
 
															+#include	"add_run.h"
														
 
															+
														
 
															/* Forward declarations of the file-local helpers used by add_run(). */

															/* Fills in one chunk: its text and its first and last position. */
															static void set_chunk(
																struct chunk *cnk,
																struct text *txt,
																unsigned int start,
																unsigned int size
															);

															/* Fills in one position and links it into the tx_pos list of txt. */
															static void set_pos(
																struct position *pos,
																int type,
																struct text *txt,
																unsigned int start
															);
														
 
															+
														
 
															+void
														
 
															+add_run(struct text *txt0, unsigned int i0,
														
 
															+	struct text *txt1, unsigned int i1,
														
 
															+	unsigned int size
														
 
															+) {
														
 
															+	/*	Adds the run of given size to our collection.
														
 
															+	*/
														
 
															+	register struct run *r = (struct run *)malloc(sizeof (struct run));
														
 
															+
														
 
															+	if (!r) fatal("out of memory");
														
 
															+	set_chunk(&r->rn_cn0, txt0, i0 - txt0->tx_start, size);
														
 
															+	set_chunk(&r->rn_cn1, txt1, i1 - txt1->tx_start, size);
														
 
															+	r->rn_size = size;
														
 
															+
														
 
															+	if (option_set('p') ? add_to_percentages(r) : add_to_runs(r)) {
														
 
															+		/* OK */
														
 
															+	}
														
 
															+	else	fatal("out of memory");
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+set_chunk(struct chunk *cnk, struct text *txt,
														
 
															+	  unsigned int start, unsigned int size
														
 
															+) {
														
 
															+	/*	Fill the chunk *cnk with info about the piece of text
														
 
															+		in txt starting at start extending over size tokens.
														
 
															+	*/
														
 
															+	cnk->ch_text = txt;
														
 
															+	set_pos(&cnk->ch_first, 0, txt, start);
														
 
															+	set_pos(&cnk->ch_last, 1, txt, start + size - 1);
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+set_pos(struct position *pos, int type, struct text *txt, unsigned int start) {
														
 
															+	/* Fill a single struct position */
														
 
															+	pos->ps_next = txt->tx_pos;
														
 
															+	txt->tx_pos = pos;
														
 
															+
														
 
															+	pos->ps_type = type;
														
 
															+	pos->ps_tk_cnt = start;
														
 
															+	pos->ps_nl_cnt = -1;		/* uninitialized */
														
 
															+}
														
--- a/utils/sim_pasc/add_run.h
+++ b/utils/sim_pasc/add_run.h
@@ -0,0 +1,19 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: add_run.h,v 1.1 2001/09/28 09:03:39 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Interface between front-end and back-end: all information about
														
 
															+	runs passes through add_run().  Its parameters are the two chunks,
														
 
															+	each identified by their struct text and the position of the common
														
 
															+	segment in TokenArray[], and the number of tokens in the common
														
 
															+	segment.
														
 
															+*/
														
 
															+
														
 
															/*	txt0, i0:	text of the first chunk, and the chunk position in
																		TokenArray[]
																txt1, i1:	text of the second chunk, and the chunk position in
																		TokenArray[]
																size:		number of tokens in the chunk
															*/
															void add_run(
																struct text *txt0, unsigned int i0,
																struct text *txt1, unsigned int i1,
																unsigned int size
															);
														
--- a/utils/sim_pasc/aiso.bdy
+++ b/utils/sim_pasc/aiso.bdy
@@ -0,0 +1,186 @@
 
															+/*
														
 
															+	Module:	Arbitrary-In Sorted-Out (AISO)
														
 
															+	Author:	dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam)
														
 
															+
														
 
															+Description:
														
 
															+	This is the body of a module that builds an arbitrary-in
														
 
															+	sorted-out data structure, to be used as a heap, a priority queue, etc.
														
 
															+	See aiso.spc for further info.
														
 
															+*/
														
 
															+
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															/* Root of the tree that holds the stored values. */
															static struct aiso_node *root;
															#ifdef	AISO_ITERATOR
															/* Start of the linked list that the iterator walks. */
															static struct aiso_node *list;
															#endif	/* AISO_ITERATOR */

															/*	The rotation policy.  aiso_size is the number of entries in the
																store.  access_mark starts at 1, is doubled by count_access() and
																set back to 1 by reset_access(); must_rotate() holds as soon as
																access_mark has grown beyond aiso_size.
															*/
															static int aiso_size = 0;
															static int access_mark = 1;

															#define	add_entry()	(aiso_size++)
															#define	remove_entry()	(aiso_size--)
															#define	reset_access()	(access_mark = 1)
															#define	count_access()	(access_mark <<= 1)
															#define	must_rotate()	(access_mark > aiso_size)
														
 
															+
														
 
															+int
														
 
															+InsertAiso(AISO_TYPE v) {
														
 
															+	register struct aiso_node *new_node;
														
 
															+	register struct aiso_node **hook = &root;
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+	register struct aiso_node **prev = &list;
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+
														
 
															+	new_node = (struct aiso_node *)malloc(sizeof (struct aiso_node));
														
 
															+	if (!new_node) {
														
 
															+		/* avoid modifying the tree */
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	while (*hook) {
														
 
															+		register struct aiso_node *an = *hook;
														
 
															+
														
 
															+		count_access();
														
 
															+		if (AISO_BEFORE(v, an->an_value)) {
														
 
															+			/* head left */
														
 
															+			if (!an->an_left || !must_rotate()) {
														
 
															+				/* standard action */
														
 
															+				hook = &an->an_left;
														
 
															+			}
														
 
															+			else {
														
 
															+				/* change (l A r) B (C) into (l) A (r B C) */
														
 
															+				register struct aiso_node *anl = an->an_left;
														
 
															+
														
 
															+				an->an_left = anl->an_right;
														
 
															+				anl->an_right = an;
														
 
															+				*hook = anl;
														
 
															+				reset_access();
														
 
															+			}
														
 
															+		}
														
 
															+		else {
														
 
															+			/* head right */
														
 
															+			if (!an->an_right || !must_rotate()) {
														
 
															+				/* standard action */
														
 
															+				hook = &an->an_right;
														
 
															+			}
														
 
															+			else {
														
 
															+				/* change (A) B (l C r) into (A B l) C (r) */
														
 
															+				register struct aiso_node *anr = an->an_right;
														
 
															+
														
 
															+				an->an_right = anr->an_left;
														
 
															+				anr->an_left = an;
														
 
															+				*hook = anr;
														
 
															+				reset_access();
														
 
															+			}
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+			prev = &an->an_next;
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	new_node->an_left = 0;
														
 
															+	new_node->an_right = 0;
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+	new_node->an_next = *prev;
														
 
															+	*prev = new_node;
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+	new_node->an_value = v;
														
 
															+	*hook = new_node;
														
 
															+	add_entry();
														
 
															+	return 1;
														
 
															+}
														
 
															+
														
 
															+#ifdef	AISO_EXTRACTOR
														
 
															+
														
 
															+int
														
 
															+ExtractAiso(AISO_TYPE *vp) {
														
 
															+	register struct aiso_node **hook = &root;
														
 
															+	register struct aiso_node *an;
														
 
															+
														
 
															+	if (!root) return 0;
														
 
															+
														
 
															+	while ((an = *hook), an->an_left) {
														
 
															+		/* head left */
														
 
															+		count_access();
														
 
															+		if (!must_rotate()) {
														
 
															+			/* standard action */
														
 
															+			hook = &an->an_left;
														
 
															+		}
														
 
															+		else {
														
 
															+			/* change (l A r) B (C) into (l) A (r B C) */
														
 
															+			register struct aiso_node *anl = an->an_left;
														
 
															+
														
 
															+			an->an_left = anl->an_right;
														
 
															+			anl->an_right = an;
														
 
															+			*hook = anl;
														
 
															+			reset_access();
														
 
															+		}
														
 
															+	}
														
 
															+	/* found the first */
														
 
															+	*vp = an->an_value;
														
 
															+	*hook = an->an_right;
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+	list = an->an_next;
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+	free((char *)an);
														
 
															+	remove_entry();
														
 
															+	return 1;
														
 
															+}
														
 
															+
														
 
															+#endif	/* AISO_EXTRACTOR */
														
 
															+
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+
														
 
															+void
														
 
															+OpenIter(AisoIter *ip) {
														
 
															+	*ip = list;
														
 
															+}
														
 
															+
														
 
															+int
														
 
															+GetAisoItem(AisoIter *ip, AISO_TYPE *vp) {
														
 
															+	register struct aiso_node *an = *ip;
														
 
															+
														
 
															+	if (!an) return 0;
														
 
															+
														
 
															+	*vp = an->an_value;
														
 
															+	*ip = an->an_next;
														
 
															+	return 1;
														
 
															+}
														
 
															+
														
 
															+void
														
 
															+CloseIter(AisoIter *ip) {
														
 
															+	*ip = 0;
														
 
															+}
														
 
															+
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+
														
 
															+#ifdef	AISO_DEBUG
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+
														
 
															+static void
														
 
															+print_inf(int level, char ch, struct aiso_node *an) {
														
 
															+	register int i;
														
 
															+
														
 
															+	if (!an) return;
														
 
															+
														
 
															+	print_inf(level+1, '/', an->an_right);
														
 
															+	for (i = 0; i < level; i++) {
														
 
															+		printf("     ");
														
 
															+	}
														
 
															+	printf("%c", ch);
														
 
															+	printf(AISO_FORMAT, an->an_value);
														
 
															+	printf("\n");
														
 
															+	print_inf(level+1, '\\', an->an_left);
														
 
															+}
														
 
															+
														
 
															+void
														
 
															+PrintAisoTree(void)
														
 
															+{
														
 
															+	print_inf(0, '-', root);
														
 
															+	printf("================\n");
														
 
															+}
														
 
															+
														
 
															+#endif	/* AISO_DEBUG */
														
--- a/utils/sim_pasc/aiso.spc
+++ b/utils/sim_pasc/aiso.spc
@@ -0,0 +1,102 @@
 
															+/*
														
 
															+	Module:	Arbitrary-In Sorted-Out (AISO)
														
 
															+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
														
 
															+	Version:	Tue Aug 23 12:54:22 1988
														
 
															+
														
 
															+Description:
														
 
															+	This is the specification of a generic module that builds an
														
 
															+	arbitrary-in sorted-out data structure, to be used as a heap, a
														
 
															+	priority queue, etc. Elements can be inserted, the first element
														
 
															+	extracted and the set scanned at any moment.
														
 
															+
														
 
															+Instantiation:
														
 
															+	The module is instantiated as follows.
														
 
															+	Create a file M.h for some M, which contains at least:
														
 
															+	-	a definition of AISO_TYPE, the type of the object to be stored
														
 
															+	-	a possible definition of AISO_EXTRACTOR; see below
														
 
															+	-	a possible definition of AISO_ITERATOR; see below
														
 
															+	-	#include	"aiso.spc"
														
 
															+
														
 
															+	This file M.h is to be included in all files that use the aiso
														
 
															+	package.
														
 
															+
														
 
															+	Create a file M.c which contains at least:
														
 
															+	-	#include	"M.h"
														
 
															+	-	a definition of a routine
														
 
															+			int AISO_BEFORE(AISO_TYPE v, AISO_TYPE w)
														
 
															+		which yields non-zero if v is to be sorted before w
														
 
															+	-	#include	"aiso.bdy"
														
 
															+
														
 
															+	This file compiles into the module object.
														
 
															+
														
 
															+Specification:
														
 
															+	The module always supplies:
														
 
															+	int InsertAiso(AISO_TYPE value)
														
 
															+		inserts value in its proper place; fails if out of memory
														
 
															+
														
 
															+	If AISO_EXTRACTOR is defined, the module will also supply:
														
 
															+	int ExtractAiso(AISO_TYPE *value)
														
 
															+		yields the first value in the aiso and removes it;
														
 
															+		fails if empty
														
 
															+
														
 
															+	If AISO_ITERATOR is defined, the module also supplies a type AisoIter
														
 
															+	which declares an iterator, i.e., a structure that records a position
														
 
															+	in the ordered set, plus routines for manipulating the iterator, thus
														
 
															+	enabling the user to scan the ordered set.  The iterator should be
														
 
															+	declared as:
														
 
															+		AisoIter iter;
														
 
															+	and is manipulated by the following commands:
														
 
															+
														
 
															+	void OpenIter(AisoIter *iter)
														
 
															+		opens the iterator for scanning the existing set in order
														
 
															+
														
 
															+	int GetAisoItem(AisoIter *iter, AISO_TYPE *value)
														
 
															+		yields the next value in the iterator; fails if exhausted
														
 
															+
														
 
															+	void CloseIter(AisoIter *iter)
														
 
															+		closes the iterator
														
 
															+
														
 
															+	If AISO_DEBUG is defined the module will also supply:
														
 
															+	void PrintAisoTree(void)
														
 
															+		prints the AISO tree; requires AISO_FORMAT, to be set to
														
 
															+		a format suitable to print a value of type AISO_TYPE
														
 
															+
														
 
															+Implementation:
														
 
															+	The AISO implementation is based on a self-adjusting binary tree.
														
 
															+	Degenerate behaviour of the tree is avoided by shaking the tree
														
 
															+	every 'ln aiso_size' node accesses.  This guarantees ln aiso_size
														
 
															+	behaviour in the long run, though it is possible for a single
														
 
															+	operation to take aiso_size node accesses.
														
 
															+
														
 
															+	The iterator is implemented as an additional linear linked list
														
 
															+	through the tree.  This is simpler than and at least as efficient as
														
 
															+	clever tree-wiring.
														
 
															+
														
 
															+Restrictions:
														
 
															+	Due to built-in fixed names, there can only be one AISO per program.
														
 
															+*/
														
 
															+/*	One node of the AISO binary tree; it holds a single AISO_TYPE value. */
															+struct aiso_node {
															+	struct aiso_node *an_left;	/* left subtree */
															+	struct aiso_node *an_right;	/* right subtree */
															+#ifdef	AISO_ITERATOR
															+	struct aiso_node *an_next;	/* next node in the iterator's linked list */
															+#endif	/* AISO_ITERATOR */
															+	AISO_TYPE an_value;	/* the value stored in this node */
															+};
														
 
															+
														
 
															+extern int InsertAiso(AISO_TYPE value);
														
 
															+#ifdef	AISO_EXTRACTOR
														
 
															+extern int ExtractAiso(AISO_TYPE *value);
														
 
															+#endif	/* AISO_EXTRACTOR */
														
 
															+
														
 
															+#ifdef	AISO_ITERATOR
														
 
															+typedef	struct aiso_node *AisoIter;
														
 
															+extern void OpenIter(AisoIter *iter);
														
 
															+extern int GetAisoItem(AisoIter *iter, AISO_TYPE *value);
														
 
															+extern void CloseIter(AisoIter *iter);
														
 
															+#endif	/* AISO_ITERATOR */
														
 
															+
														
 
															+#ifdef	AISO_DEBUG
														
 
															+extern void PrintAisoTree(void);
														
 
															+#endif	/* AISO_DEBUG */
														
--- a/utils/sim_pasc/algollike.c
+++ b/utils/sim_pasc/algollike.c
@@ -0,0 +1,135 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: algollike.c,v 2.4 2005/02/20 17:02:59 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	This module implements the routines InitLanguage, MayBeStartOfRun
														
 
															+	and CheckRun for ALGOL-like languages, in which it is meaningful
														
 
															+	and useful to isolate function bodies.
														
 
															+
														
 
															+	It requires the user to define, preferably in Xlang.l, four token
														
 
															+	sets, represented as TOKEN[] and terminated by NOTOKEN:
														
 
															+
														
 
															+	TOKEN NonFinals[]	tokens that may not end a chunk
														
 
															+	TOKEN NonInitials[]	tokens that may not start a chunk
														
 
															+	TOKEN Openers[]		openers of parentheses that must balance
														
 
															+					in functions
														
 
															+	TOKEN Closers[]		the corresponding closers, in the same order
														
 
															+*/
														
 
															+
														
 
															+#include	"options.h"
														
 
															+#include	"token.h"
														
 
															+#include	"algollike.h"
														
 
															+
														
 
															+/*	Arrays for fast identification tests for tokens.  Each token is
														
 
															+	identified by its position in the set + 1.  For example, if T is
														
 
															+	the n-th Opener, openers[TOKEN2int(tk)] == n+1.
														
 
															+*/
														
 
															+static char non_finals[256];	/* filled from NonFinals[] by InitLanguage() */
															+static char non_initials[256];	/* filled from NonInitials[] by InitLanguage() */
															+static char openers[256];	/* filled from Openers[] by InitLanguage() */
															+static char closers[256];	/* filled from Closers[] by InitLanguage() */
														
 
															+
														
 
															+static void cvt2bittable(const TOKEN *tl, char bt[256]);
														
 
															+static unsigned int largest_function(const TOKEN *str, unsigned int size);
														
 
															+
														
 
															+/*	Prepares the language module: converts each of the four user-supplied
															+	token sets into its lookup table, in the order NonFinals, NonInitials,
															+	Openers, Closers.
															+*/
															+void
															+InitLanguage(void) {
															+	static const struct {
															+		const TOKEN *set;	/* NOTOKEN-terminated token set */
															+		char *table;		/* lookup table to be filled */
															+	} conversion[] = {
															+		{NonFinals, non_finals},
															+		{NonInitials, non_initials},
															+		{Openers, openers},
															+		{Closers, closers}
															+	};
															+	unsigned int n;
															+
															+	for (n = 0; n < sizeof conversion / sizeof conversion[0]; n++) {
															+		cvt2bittable(conversion[n].set, conversion[n].table);
															+	}
															+}
														
 
															+
														
 
															+/*	Records in bt[], for each token of the NOTOKEN-terminated list tl, its
															+	position in that list plus one, indexed by the token's integer value.
															+	Entries for tokens that are not in the list are left untouched.
															+*/
															+static void
															+cvt2bittable(const TOKEN *tl, char bt[256]) {
															+	int pos = 0;
															+
															+	while (!TOKEN_EQ(tl[pos], NOTOKEN)) {
															+		/* positions are stored 1-based, so that 0 means "not in set" */
															+		bt[TOKEN2int(tl[pos])] = pos + 1;
															+		pos++;
															+	}
															+}
														
 
															+
														
 
															+/*	Yields 1 if the token tk may start a run, i.e. if it is not in the
															+	NonInitials set, and 0 otherwise.
															+*/
															+int
															+MayBeStartOfRun(TOKEN tk) {
															+	if (non_initials[TOKEN2int(tk)]) {
															+		/* tk is a NonInitial */
															+		return 0;
															+	}
															+	return 1;
															+}
														
 
															+
														
 
															+/*	Determines how much of the run of size tokens starting at str is
															+	acceptable in the language.  Tokens are only ever cut from the end
															+	of the run; the result is the accepted length, which may be zero.
															+*/
															+unsigned int
															+CheckRun(const TOKEN *str, unsigned int size) {
															+	unsigned int accepted = size;
															+
															+	/* with option 'f', reduce to a function-like form first */
															+	if (option_set('f')) {
															+		accepted = largest_function(str, accepted);
															+	}
															+
															+	/* strip trailing tokens that may not end a chunk */
															+	for (; accepted != 0; accepted--) {
															+		if (!non_finals[TOKEN2int(str[accepted-1])]) {
															+			/* the last token is acceptable as an end */
															+			break;
															+		}
															+	}
															+
															+	return accepted;
															+}
														
 
															+
														
 
															+/*	Returns the size of the longest sequence starting at str[0], of at
															+	most size tokens, that does not contain unbalanced parentheses.
															+	Does not check the nesting of the parentheses, but then, sim is
															+	syntax-free anyway.
															+
															+	Scanning stops at the first Closer for which no Opener is pending;
															+	the result is the length up to the most recent position at which
															+	all kinds of parentheses were balanced, or 0 if there was none.
															+*/
															+static unsigned int
															+largest_function(const TOKEN *str, unsigned int size) {
															+	unsigned int mrb_size = 0;	/* most recent balancing size */
															+	unsigned int pos;
															+	/*	Number of pending Openers per kind of parenthesis, indexed by
															+		the value found in openers[] / closers[].  All 256 counters
															+		are cleared, so every index those tables can yield is valid.
															+	*/
															+	int balance_count[256] = {0};
															+	int n_imbalances = 0;	/* number of kinds with pending Openers */
															+
															+	/* scan str[] and see how far we get */
															+	for (pos = 0; pos < size; pos++) {
															+		int tkval = TOKEN2int(str[pos]);
															+		int pp;		/* parenthesis position */
															+
															+		/* account for openers */
															+		if ((pp = openers[tkval])) {
															+			if (balance_count[pp] == 0) {
															+				/* about to create an imbalance */
															+				n_imbalances++;
															+			}
															+			balance_count[pp]++;
															+		}
															+
															+		/* account for closers */
															+		if ((pp = closers[tkval])) {
															+			if (balance_count[pp] == 0) {
															+				/* this is one Closer too many */
															+				return mrb_size;
															+			}
															+			balance_count[pp]--;
															+			if (balance_count[pp] == 0) {
															+				/* we just cleared an imbalance */
															+				n_imbalances--;
															+			}
															+		}
															+
															+		if (n_imbalances == 0) {
															+			/* register balance point */
															+			mrb_size = pos + 1;
															+		}
															+	}
															+	return mrb_size;
															+}
														
--- a/utils/sim_pasc/algollike.h
+++ b/utils/sim_pasc/algollike.h
@@ -0,0 +1,27 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: algollike.h,v 1.1 1997/06/20 12:03:11 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	The class Algollike is a subclass of Language.  It implements
														
 
															+	the routines InitLanguage, MayBeStartOfRun and CheckRun for
														
 
															+	ALGOL-like languages, in which it is meaningful and useful to
														
 
															+	isolate function bodies.
														
 
															+
														
 
															+	It requires the user to define, preferably in Xlang.l, four token
														
 
															+	sets, represented as TOKEN[] and terminated by NOTOKEN:
														
 
															+
														
 
															+	TOKEN NonFinals[]	tokens that may not end a chunk
														
 
															+	TOKEN NonInitials[]	tokens that may not start a chunk
														
 
															+	TOKEN Openers[]		openers of parentheses that must balance
														
 
															+					in functions
														
 
															+	TOKEN Closers[]		the corresponding closers, in the same order
														
 
															+*/
														
 
															+
														
 
															+#include	"language.h"
														
 
															+#include	"token.h"
														
 
															+
														
 
															+extern const TOKEN NonFinals[];
														
 
															+extern const TOKEN NonInitials[];
														
 
															+extern const TOKEN Openers[];
														
 
															+extern const TOKEN Closers[];
														
--- a/utils/sim_pasc/clang.l
+++ b/utils/sim_pasc/clang.l
@@ -0,0 +1,252 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: clang.l,v 2.9 2007/08/29 09:10:31 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	C language front end for the similarity tester.
														
 
															+	Author:	Dick Grune <[email protected]>
														
 
															+*/
														
 
															+
														
 
															+#include	"options.h"
														
 
															+#include	"algollike.h"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+
														
 
															+/* Data for module idf */
														
 
															+/*	Preprocessor command names and the tokens they map to; consulted by
															+	the preprocessor-line rule through idf_in_list().
															+	NOTE(review): the entries are in alphabetical order; presumably
															+	idf_in_list() relies on that -- confirm before adding entries.
															+*/
															+static const struct idf ppcmd[] = {
															+	{"define",	META('d')},
															+	{"else",	META('e')},
															+	{"endif",	META('E')},
															+	{"if",		META('i')},
															+	{"ifdef",	META('I')},
															+	{"ifndef",	META('x')},
															+	{"include",	MTCT('I')},
															+	{"line",	META('l')},
															+	{"undef",	META('u')}
															+};
														
 
															+/*	The reserved words of C and the tokens they map to; consulted by
															+	idf2token() through idf_in_list().  Words mapped to SKIP produce no
															+	token at all: the identifier rules drop them.
															+	NOTE(review): the entries are in alphabetical order; presumably
															+	idf_in_list() relies on that -- confirm before adding entries.
															+*/
															+static const struct idf reserved[] = {
															+	{"auto",	NORM('a')},
															+	{"break",	NORM('b')},
															+	{"case",	NORM('c')},
															+	{"char",	NORM('C')},
															+	{"continue",	CTRL('C')},
															+	{"default",	NORM('d')},
															+	{"do",		NORM('D')},
															+	{"double",	CTRL('D')},
															+	{"else",	NORM('e')},
															+	{"enum",	NORM('E')},
															+	{"extern",	CTRL('E')},
															+	{"float",	NORM('f')},
															+	{"for",		NORM('F')},
															+	{"goto",	NORM('g')},
															+	{"if",		NORM('i')},
															+	{"int",		NORM('I')},
															+	{"long",	NORM('l')},
															+	{"register",	SKIP},
															+	{"return",	NORM('r')},
															+	{"short",	NORM('s')},
															+	{"sizeof",	NORM('S')},
															+	{"static",	CTRL('S')},
															+	{"struct",	META('s')},
															+	{"switch",	META('S')},
															+	{"typedef",	NORM('t')},
															+	{"union",	NORM('u')},
															+	{"unsigned",	NORM('U')},
															+	{"void",	SKIP},
															+	{"while",	NORM('w')}
															+};
														
 
															+
														
 
															+/* Special treatment of identifiers */
														
 
															+
														
 
															+/*	Converts the identifier in yytext to a token: the token of the
															+	reserved word if it is one, and otherwise IDF or, when hashing is
															+	non-zero, a one-token hash code of the identifier.
															+*/
															+static TOKEN
															+idf2token(int hashing) {
															+	const TOKEN found = idf_in_list(yytext, reserved, sizeof reserved, IDF);
															+
															+	if (!TOKEN_EQ(found, IDF) || !hashing) {
															+		/* a reserved word, or no hashing requested */
															+		return found;
															+	}
															+
															+	/* return a one-token hash code */
															+	return idf_hashed(yytext);
															+}
														
 
															+
														
 
															+/* Token sets for module algollike; each set is terminated by NOTOKEN
															+   and is declared extern in algollike.h.
															+*/
															+const TOKEN NonFinals[] = {	/* tokens that may not end a chunk */
															+	IDF,		/* identifier */
															+	NORM('{'),
															+	NORM('('),
															+	NORM('a'),	/* auto */
															+	NORM('b'),	/* break */
															+	NORM('c'),	/* case */
															+	NORM('C'),	/* char */
															+	CTRL('C'),	/* continue */
															+	NORM('d'),	/* default */
															+	NORM('D'),	/* do */
															+	CTRL('D'),	/* double */
															+	NORM('E'),	/* enum */
															+	CTRL('E'),	/* extern */
															+	NORM('f'),	/* float */
															+	NORM('F'),	/* for */
															+	NORM('g'),	/* goto */
															+	NORM('i'),	/* if */
															+	NORM('I'),	/* int */
															+	NORM('l'),	/* long */
															+	NORM('r'),	/* return */
															+	NORM('s'),	/* short */
															+	CTRL('S'),	/* static */
															+	META('s'),	/* struct */
															+	META('S'),	/* switch */
															+	NORM('t'),	/* typedef */
															+	NORM('u'),	/* union */
															+	NORM('U'),	/* unsigned */
															+	NORM('w'),	/* while */
															+	NOTOKEN
															+};
															+const TOKEN NonInitials[] = {	/* tokens that may not start a chunk */
															+	NORM(')'),
															+	NORM('}'),
															+	NORM(';'),
															+	NOTOKEN
															+};
															+const TOKEN Openers[] = {	/* openers of parentheses that must balance */
															+	NORM('{'),
															+	NORM('('),
															+	NORM('['),
															+	NOTOKEN
															+};
															+const TOKEN Closers[] = {	/* the corresponding closers, in the same order */
															+	NORM('}'),
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
														
 
															+%option never-interactive
														
 
															+
														
 
															+%Start	Comment
														
 
															+
														
 
															+Layout		([ \t\r\f])
														
 
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
														
 
															+
														
 
															+AnyQuoted	(\\.)
														
 
															+StrChar		([^"\n\\]|{AnyQuoted})
														
 
															+ChrChar		([^'\n\\]|{AnyQuoted})
														
 
															+
														
 
															+StartComment	("/*")
														
 
															+EndComment	("*/")
														
 
															+SafeComChar	([^*\n])
														
 
															+UnsafeComChar	("*")
														
 
															+
														
 
															+Digit		([0-9a-fA-F])
														
 
															+Idf		([A-Za-z][A-Za-z0-9_]*)
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+{StartComment}	{
														
 
															+		/*	We do not have one single pattern to match a comment
														
 
															+			(although one can be written), for two reasons.
														
 
															+			The matched string might overflow lex-internal buffers
														
 
															+			like yysbuf and yytext; and the pattern would be very
														
 
															+			complicated and overtax lex.
														
 
															+			So we break up the string into safe chunks and keep
														
 
															+			track of where we are in a start condition <Comment>.
														
 
															+		*/
														
 
															+		BEGIN Comment;
														
 
															+	}
														
 
															+
														
 
															+<Comment>{SafeComChar}+	{		/* safe comment chunk */
														
 
															+	}
														
 
															+
														
 
															+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
														
 
															+	}
														
 
															+
														
 
															+<Comment>"\n"		{		/* to break up long comments */
														
 
															+		return_eol();
														
 
															+	}
														
 
															+
														
 
															+<Comment>{EndComment}	{		/* end-of-comment */
														
 
															+		BEGIN INITIAL;
														
 
															+	}
														
 
															+
														
 
															+\"{StrChar}*\"	{			/* strings */
														
 
															+		return_ch('"');
														
 
															+	}
														
 
															+
														
 
															+\'{ChrChar}+\'	{			/* characters */
														
 
															+		return_ch('\'');
														
 
															+	}
														
 
															+
														
 
															+^#{Layout}*include.*	{		/* ignore #include lines */
														
 
															+	}
														
 
															+
														
 
															+^#{Layout}*{Idf}	{		/* a preprocessor line */
															+		/*	yytext holds '#', optional layout and the command
															+			name; the name is looked up in ppcmd[] and any
															+			unknown command yields the token NORM('#').
															+		*/
															+		register char *idf = yytext+1;
															+
															+		/*	skip layout in front of preprocessor identifier;
															+			this must cover every character in {Layout}, or
															+			the lookup would start on a layout character
															+		*/
															+		while (	*idf == ' ' || *idf == '\t'
															+		||	*idf == '\r' || *idf == '\f'
															+		) {
															+			idf++;
															+		}
															+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
															+	}
														
 
															+
														
 
															+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
														
 
															+		return_tk(IDF);
														
 
															+	}
														
 
															+
														
 
															+{Idf}/"("	{			/* identifier in front of ( */
														
 
															+		register TOKEN tk;
														
 
															+
														
 
															+		tk = idf2token(option_set('F'));
														
 
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
														
 
															+	}
														
 
															+
														
 
															+{Idf}	{				/* identifier */
														
 
															+		register TOKEN tk;
														
 
															+
														
 
															+		tk = idf2token(0 /* no hashing */);
														
 
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
														
 
															+	}
														
 
															+
														
 
															+\;	{				/* semicolon, conditionally ignored */
														
 
															+		if (option_set('f')) return_ch(yytext[0]);
														
 
															+	}
														
 
															+
														
 
															+\n	{				/* count newlines */
														
 
															+		return_eol();
														
 
															+	}
														
 
															+
														
 
															+{Layout}	{			/* ignore layout */
														
 
															+	}
														
 
															+
														
 
															+{ASCII95}	{			/* copy other text */
														
 
															+		return_ch(yytext[0]);
														
 
															+	}
														
 
															+
														
 
															+.	{				/* count non-ASCII chars */
														
 
															+		lex_non_ascii_cnt++;
														
 
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+/*	Puts the scanner in the INITIAL start condition. */
															+void
															+yystart(void) {
															+	BEGIN INITIAL;
															+}
														
 
															+/*	End-of-input hook called by the (f)lex scanner; always returns 1. */
															+int
															+yywrap(void) {
															+	return 1;	/* non-zero tells the scanner there is no more input */
															+}
														
--- a/utils/sim_pasc/compare.c
+++ b/utils/sim_pasc/compare.c
@@ -0,0 +1,198 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: compare.c,v 2.5 2001/09/28 09:03:47 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	"sim.h"
														
 
															+#include	"tokenarray.h"
														
 
															+#include	"hash.h"
														
 
															+#include	"language.h"
														
 
															+#include	"options.h"
														
 
															+#include	"add_run.h"
														
 
															+#include	"compare.h"
														
 
															+
														
 
															+static void compare1text(int, int, int);
														
 
															+static unsigned int lcs(
														
 
															+	struct text *, unsigned int, struct text **, unsigned int *,
														
 
															+	unsigned int, unsigned int
														
 
															+);
														
 
															+
														
 
															+/*	The overall structure of the routine Compare() is:
														
 
															+
														
 
															+	for all new files
														
 
															+		for all texts it must be compared to
														
 
															+			for all positions in the new file
														
 
															+				for all positions in the text
														
 
															+					for ever increasing sizes
														
 
															+						try to match and keep the best
														
 
															+*/
														
 
															+
														
 
															+/*	Compares every new text (0 .. NumberOfNewTexts-1) to a range of
															+	texts that ends at NumberOfTexts.  The start of that range is
															+	NumberOfNewTexts + 1 under option 'S', the text following the
															+	new text under option 's', and the new text itself otherwise.
															+	Under option 'e' the range is handed to compare1text() one
															+	text at a time, otherwise in a single call.
															+*/
															+void
															+Compare(void)
															+{
															+	int new_txt;
															+
															+	for (new_txt = 0; new_txt < NumberOfNewTexts; new_txt++) {
															+		int start;
															+		int other;
															+
															+		/* where does the range of texts to compare to begin? */
															+		if (option_set('S')) {
															+			start = NumberOfNewTexts + 1;
															+		}
															+		else if (option_set('s')) {
															+			start = new_txt + 1;
															+		}
															+		else {
															+			start = new_txt;
															+		}
															+
															+		if (!option_set('e')) {
															+			/* the whole range in one action, if not empty */
															+			if (start < NumberOfTexts) {
															+				compare1text(new_txt, start, NumberOfTexts);
															+			}
															+			continue;
															+		}
															+
															+		/* the range in steps of one text */
															+		other = start;
															+		while (other < NumberOfTexts) {
															+			compare1text(new_txt, other, other + 1);
															+			other++;
															+		}
															+	}
															+}
														
 
															+
														
 
															+/*	Scans text n from its start and, at each position, asks lcs()
															+	for the best run in the texts first .. limit-1.  A run that is
															+	found is passed to add_run() and then stepped over; otherwise
															+	the scan advances by one token.  The scan ends when fewer than
															+	MinRunSize tokens remain in text n.
															+*/
															+static void
															+compare1text(int n, int first, int limit)
															+{
															+	struct text *subject = &Text[n];
															+	unsigned int range_start = Text[first].tx_start;
															+	unsigned int range_limit = Text[limit-1].tx_limit;
															+	unsigned int pos;
															+
															+	for (	pos = subject->tx_start;
															+		pos + MinRunSize - 1 < subject->tx_limit;
															+		/* pos is advanced in the body */
															+	) {
															+		struct text *best_txt;
															+		unsigned int best_pos;
															+		unsigned int best_size = lcs(
															+			subject, pos, &best_txt, &best_pos,
															+			range_start, range_limit
															+		);
															+
															+		if (best_size == 0) {
															+			/* nothing here; move on to the next token */
															+			pos++;
															+			continue;
															+		}
															+
															+		/* register the run and continue right behind it */
															+		add_run(subject, pos, best_txt, best_pos, best_size);
															+		pos += best_size;
															+	}
															+}
														
 
															+
														
 
															+static unsigned int
															+lcs(	struct text *txt0,		/* input: starting position */
															+	unsigned int i0,
															+	struct text **tbp,		/* output: position of best run */
															+	unsigned int *ibp,
															+	unsigned int i_first,		/* no candidate before this pos. */
															+	unsigned int i_limit		/* no candidate at or after this pos. */
															+) {
															+	/*	Finds the longest common substring (not -sequence) in:
															+			txt0, starting precisely at i0 and
															+			the text between i_first and i_limit.
															+		Writes the position in tbp and ibp and returns the size.
															+		Returns 0 if no common substring is found.
															+
															+		Candidate positions are obtained by following the chain
															+		of forward references that starts at i0.  A candidate is
															+		recorded only if, after CheckRun(), it is at least
															+		MinRunSize long and strictly longer than the best run so
															+		far.  *tbp and *ibp are assigned only when a run is
															+		recorded, so they are left untouched when 0 is returned.
															+	*/
															+	register struct text *txt1 = txt0;
															+	register unsigned int i1 = i0;
															+	register unsigned int size_best = 0;
															+	register unsigned int txt0limit = txt0->tx_limit;
															+	register unsigned int txt1limit = txt1->tx_limit;
															+
															+	while (	/* there is a next opportunity */
															+		(i1 = ForwardReference(i1))
															+	&&	/* it is still in range */
															+		i1 < i_limit
															+	) {
															+		register unsigned int min_size;
															+		register unsigned int new_size;
															+		register unsigned int j0;
															+		register unsigned int j1;
															+
															+		if (i1 < i_first) {	/* not in range */
															+			continue;
															+		}
															+
															+		/* bump txt1; we may have skipped a text or two */
															+		while (i1 >= txt1->tx_limit) {
															+			txt1++;
															+		}
															+		txt1limit = txt1->tx_limit;
															+
															+		/* a candidate only counts if it beats the best so far */
															+		min_size = (size_best ? size_best+1 : MinRunSize);
															+		/* are we looking at something better than we have got? */
															+		{
															+			j0 = i0 + min_size - 1;
															+			j1 = i1 + min_size - 1;
															+			if (	/* j0 still inside txt0 */
															+				j0 < txt0limit
															+			&&	/* j1 still inside txt1 */
															+				j1 < txt1limit
															+			&&	/* j0 and j1 don't overlap;
															+				   j1 - min_size + 1 is i1 again
															+				*/
															+				j0 < j1 - min_size + 1
															+			) {
															+				/* there would be room enough */
															+				register int cnt = min_size;
															+
															+				/* does the text match?  the min_size tokens
															+				   are compared backwards, last one first
															+				*/
															+				while (	cnt
															+				&&	TOKEN_EQ(TokenArray[j0], TokenArray[j1])
															+				) {
															+					cnt--, j0--, j1--;
															+				}
															+				if (cnt) continue;	/* forget it */
															+			}
															+			else continue;			/* forget it */
															+		}
															+
															+		/* yes, we are; how long can we make it? */
															+		{
															+			register unsigned int size = min_size;
															+
															+			j0 = i0 + min_size;
															+			j1 = i1 + min_size;
															+			while (	/* j0 still inside txt0 */
															+				j0 < txt0limit
															+			&&	/* j1 still inside txt1 */
															+				j1 < txt1limit
															+			&&	/* j0 and j1 don't overlap; with
															+				   j0 == i0+size and j1 == i1+size
															+				   this reads i0+size < i1
															+				*/
															+				j0 + size < j1
															+			&&	/* tokens are the same */
															+				TOKEN_EQ(TokenArray[j0], TokenArray[j1])
															+			) {
															+				j0++, j1++, size++;
															+			}
															+			new_size = size;
															+		}
															+
															+		/*	offer the run to the Language Department which may
															+			reject it or may cut its tail
															+		*/
															+		new_size = (	MayBeStartOfRun(TokenArray[i0])
															+			   ?	CheckRun(&TokenArray[i0], new_size)
															+			   :	0
															+			   );
															+
															+		if (	/* we still have something acceptable */
															+			new_size >= MinRunSize
															+		&&	/* it is better still than what we had */
															+			new_size > size_best
															+		) {
															+			/* record it */
															+			*tbp = txt1;
															+			*ibp = i1;
															+			size_best = new_size;
															+		}
															+	}
															+
															+	return size_best;
															+}
														
--- a/utils/sim_pasc/compare.h
+++ b/utils/sim_pasc/compare.h
@@ -0,0 +1,11 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: compare.h,v 1.2 1998/01/21 14:27:47 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Compares each new text to the appropriate texts.
														
 
															+	Stores the runs found in the AISO heap.
														
 
															+	Runs contain references to positions in the input files.
														
 
															+*/
														
 
															+
														
 
															+extern void Compare(void);
														
--- a/utils/sim_pasc/debug.par
+++ b/utils/sim_pasc/debug.par
@@ -0,0 +1,20 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: debug.par,v 1.3 1998/02/03 14:28:21 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#undef	DB_FORW_REF			/* print & check forward references */
															+#undef	DB_TEXT				/* print all text parts */
															+#undef	DB_POS				/* print positions in files */
															+#undef	DB_NL_BUFF			/* print the newline count buffer */
															+#undef	DB_RUN				/* print all identified runs */
															+/* As written, every debug switch above is left undefined. */
															+#ifdef	lint
															+/* When lint is defined, all debug switches are defined. */
															+#define	DB_FORW_REF
															+#define	DB_TEXT
															+#define	DB_POS
															+#define	DB_NL_BUFF
															+#define	DB_RUN
															+
															+#endif	/* lint */
														
--- a/utils/sim_pasc/error.c
+++ b/utils/sim_pasc/error.c
@@ -0,0 +1,16 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: error.c,v 2.4 1998/02/03 14:28:22 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<stdlib.h>
														
 
															+
														
 
															+#include	"sim.h"
														
 
															+#include	"error.h"
														
 
															+
														
 
															+/*	Reports msg on stderr, prefixed with the program name, and
															+	terminates the program with exit status 1.
															+*/
															+void
															+fatal(const char *msg)
															+{
															+	fprintf(stderr, "%s: %s\n", progname, msg);
															+	exit(1);
															+}
														
--- a/utils/sim_pasc/error.h
+++ b/utils/sim_pasc/error.h
@@ -0,0 +1,6 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: error.h,v 1.3 1998/02/03 14:28:23 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+extern void fatal(const char *msg);
														
--- a/utils/sim_pasc/hash.c
+++ b/utils/sim_pasc/hash.c
@@ -0,0 +1,386 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Text is compared by comparing every substring to all substrings
														
 
															+	to the right of it; this process is in essence quadratic.  However,
														
 
															+	only substrings of length at least 'MinRunSize' are of interest,
														
 
															+	which gives us the possibility to speed up this process by using
														
 
															+	a hash table.
														
 
															+
														
 
															+	For every position in the text, we construct an index which gives
														
 
															+	the next position in the text at which a run of MinRunSize tokens
														
 
															+	starts that has the same hash code, as calculated by hash1().  If
														
 
															+	there is no such run, the index is 0.  These forward references are
														
 
															+	kept in the array forward_references[].
														
 
															+
														
 
															+	To construct this array, we use a hash table last_index[] whose size
														
 
															+	is a prime and which is about 8 times smaller than the text array.
														
 
															+	The hash table last_index[] is set up such that last_index[i] is the
														
 
															+	index of the latest token with hash_code i, or 0 if there is none.
														
 
															+	This results in hash chains of an average length of 8.  See
														
 
															+	MakeForwardReferences().
														
 
															+
														
 
															+	If there is not enough room for a hash table of the proper size
														
 
															+	(which can be considerable) the hashing is not efficient any more.
														
 
															+	In that case, the forward reference table is scanned a second time,
														
 
															+	eliminating from any chain all references to runs that do not hash to
														
 
															+	the same value under a second hash function, hash2().  For the UNIX
														
 
															+	manuals this reduced the number of matches from 91.9% to 1.9% (of
														
 
															+	which 0.06% was genuine).
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"system.par"
														
 
															+#include	"debug.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"error.h"
														
 
															+#include	"language.h"
														
 
															+#include	"token.h"
														
 
															+#include	"tokenarray.h"
														
 
															+#include	"options.h"
														
 
															+#include	"hash.h"
														
 
															+
														
 
															+							/* MAIN ENTRIES */
														
 
															+static unsigned int *forward_references;	/* to be filled by malloc() */
														
 
															+static int n_forward_references;
														
 
															+
														
 
															+static void make_forward_references_hash1(void);
														
 
															+static void make_forward_references_hash2(void);
														
 
															+
														
 
															+#ifdef	DB_FORW_REF
														
 
															+static void db_forward_references(const char *);
														
 
															+static void make_forward_references_hash3(void);
														
 
															+#endif
														
 
															+
														
 
															+/*	Builds the forward references table: allocates one zeroed
															+	entry per text position (TextLength() entries), fills it with
															+	the hash1 sweep and prunes it with the hash2 sweep.  When
															+	DB_FORW_REF is defined, the hash3 sweep is run as well.
															+	Calls fatal() when the table cannot be allocated.
															+*/
															+void
															+MakeForwardReferences(void)
															+{
															+	n_forward_references = TextLength();
															+	forward_references =
															+		calloc(n_forward_references, sizeof *forward_references);
															+	if (forward_references == 0) {
															+		fatal("out of memory");
															+	}
															+
															+	make_forward_references_hash1();
															+	make_forward_references_hash2();
															+#ifdef	DB_FORW_REF
															+	make_forward_references_hash3();
															+#endif
															+}
														
 
															+
														
 
															+/*	Returns the forward reference stored for position i.
															+	Calls fatal() unless 0 < i < n_forward_references.
															+*/
															+unsigned int
															+ForwardReference(int i)
															+{
															+	int in_range = (0 < i && i < n_forward_references);
															+
															+	if (!in_range) {
															+		fatal("internal error, bad forward reference");
															+	}
															+	return forward_references[i];
															+}
														
 
															+
														
 
															+/*	Releases the forward references table.
															+
															+	The pointer and the entry count are reset as well, so that a
															+	later ForwardReference() call is caught by its range check
															+	instead of reading freed memory, and a repeated call of this
															+	routine is harmless (free() of a null pointer does nothing).
															+*/
															+void
															+FreeForwardReferences(void)
															+{
															+	free((char *)forward_references);
															+	forward_references = 0;
															+	n_forward_references = 0;
															+}
														
 
															+
														
 
															+							/* HASHING */
														
 
															+/*
														
 
															+	We want a hash function whose time cost does not depend on
														
 
															+	MinRunSize, which is a problem since the size of the value
														
 
															+	we derive the hash function from IS equal to MinRunSize!
														
 
															+	Therefore we base the hash function on a sample of at most 24
														
 
															+	tokens from the input string; this works at least as well in
														
 
															+	practice.  These 24 token values will result in exactly 31
														
 
															+	bits under the hashing algorithm used, which avoids an
														
 
															+	overflow test.  So this 24 bears no relation to the default
														
 
															+	run size (although the fit is surprising!)
														
 
															+*/
														
 
															+
														
 
															+#define	N_SAMPLES	24
														
 
															+#define	OPERATION	^
														
 
															+
														
 
															+/*	An alternative algorithm; does not seem to make any difference.
														
 
															+#define	N_SAMPLES	23
														
 
															+#define	OPERATION	+
														
 
															+*/
														
 
															+
														
 
															+/*	Another algorithm; not yet tested
														
 
															+#define	N_SAMPLES	24
														
 
															+#define	OPERATION	+ 613 *
														
 
															+*/
														
 
															+
														
 
															+static unsigned int *last_index;
														
 
															+static unsigned int hash_table_size;
														
 
															+static int sample_pos[N_SAMPLES];
														
 
															+
														
 
															+static unsigned int
															+prime[] = {		/* lots of hopefully suitable primes, in
															+			   ascending order, each roughly twice its
															+			   predecessor; init_hash_table() takes
															+			   the hash table size from this list
															+			*/
															+	10639,
															+	21283,
															+	42571,
															+	85147,
															+	170227,
															+	340451,
															+	680959,
															+	1361803,
															+	2723599,
															+	5447171,
															+	10894379,
															+	21788719,
															+	43577399,
															+	87154759,
															+	174309383,
															+	348618827,
															+	697237511,
															+	1394475011
															+};
														
 
															+
														
 
															+/*	Allocates the hash table last_index[] and computes the sample
															+	positions used by hash1().
															+
															+	The table size is the first entry of prime[] that is not
															+	smaller than TextLength(); if the text is longer than the
															+	largest entry, the largest entry is used.  If the allocation
															+	fails, ever smaller entries are tried; fatal() is called when
															+	even the smallest one cannot be allocated.
															+*/
															+static void
															+init_hash_table(void) {
															+	const int n_primes = (int)(sizeof prime / sizeof prime[0]);
															+	register int n;
															+
															+	/* find the ideal hash table size, without running off the
															+	   end of prime[] for texts longer than its largest entry
															+	*/
															+	n = 0;
															+	while (n < n_primes - 1 && prime[n] < TextLength()) {
															+		n++;
															+	}
															+
															+	/* see if we can allocate that much space, and if not, step down */
															+	last_index = 0;
															+	while (!last_index && n >= 0) {
															+		hash_table_size = prime[n];
															+		last_index = (unsigned int *)
															+			calloc(hash_table_size, sizeof (unsigned int));
															+		n--;
															+	}
															+	if (!last_index) {
															+		fatal("out of memory");
															+	}
															+
															+	/* find sample positions: N_SAMPLES offsets spread evenly over
															+	   0 .. MinRunSize-1
															+	*/
															+	for (n = 0; n < N_SAMPLES; n++) {
															+		/* straight-line approximation, rounded to nearest;
															+		   unintuitive as usual
															+		*/
															+		sample_pos[n] = (
															+			(2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
															+		/	(2 * (N_SAMPLES - 1))
															+		);
															+	}
															+}
														
 
															+
														
 
															+static int hash1(const TOKEN *);
														
 
															+
														
 
															+/*	First sweep: links every position at which a run may start to
															+	the next such position with the same hash1() code.  For each
															+	hash code last_index[] remembers the latest position seen; a
															+	new position with that code is stored as the forward reference
															+	of the remembered one.  The last MinRunSize-1 positions of each
															+	text are not considered.  last_index[] is freed at the end.
															+*/
															+static void
															+make_forward_references_hash1(void)
															+{
															+	int txt_no;
															+
															+	init_hash_table();
															+
															+	for (txt_no = 0; txt_no < NumberOfTexts; txt_no++) {
															+		struct text *cur = &Text[txt_no];
															+		unsigned int pos = cur->tx_start;	/* >= 1 */
															+
															+		while (pos + MinRunSize - 1 < cur->tx_limit) {
															+			if (MayBeStartOfRun(TokenArray[pos])) {
															+				int code = hash1(&TokenArray[pos]);
															+				unsigned int previous = last_index[code];
															+
															+				/* chain the previous holder to us */
															+				if (previous) {
															+					forward_references[previous] = pos;
															+				}
															+				/* and become the latest holder */
															+				last_index[code] = pos;
															+			}
															+			pos++;
															+		}
															+	}
															+	free((char *)last_index);
															+
															+#ifdef	DB_FORW_REF
															+	db_forward_references("first hashing");
															+#endif	/* DB_FORW_REF */
															+}
														
 
															+
														
 
															+/*	Returns the primary hash code, in the range
															+	0 .. hash_table_size-1, of the MinRunSize tokens starting at p.
															+	Only the N_SAMPLES tokens at the offsets in sample_pos[] are
															+	used.  The caller guarantees that at least MinRunSize tokens
															+	are available at p.  Calls fatal() if the accumulated value
															+	turns out negative.
															+*/
															+static int
															+hash1(const TOKEN *p)
															+{
															+	int32 acc = 0;
															+	int k = 0;
															+
															+	while (k < N_SAMPLES) {
															+		acc = (acc << 1) OPERATION TOKEN2int(p[sample_pos[k]]);
															+#if	N_SAMPLES > 24
															+		if (acc & (1<<31)) {
															+			acc ^= (1<<31|1);
															+		}
															+#endif
															+		k++;
															+	}
															+
															+	/* guards against unsuitable N_SAMPLES and OPERATION settings */
															+	if (acc < 0) {
															+		fatal("corrupt hash algorithm in hash1() in hash.c");
															+	}
															+
															+	return acc % hash_table_size;
															+}
														
 
															+
														
 
															+static int hash2(const TOKEN *);
														
 
															+
														
 
															+/*	Second sweep: for every position i with i+MinRunSize below
															+	TextLength(), walks the forward reference chain of i until it
															+	meets a position with the same hash2() code as i, and makes
															+	that position (or 0 if the chain ends first) the forward
															+	reference of i.  Overlapping sequences are deliberately not
															+	removed here, since the wrong copy might be dropped.
															+*/
															+static void
															+make_forward_references_hash2(void)
															+{
															+	unsigned int start;
															+
															+	for (start = 0; start+MinRunSize < TextLength(); start++) {
															+		int wanted = hash2(&TokenArray[start]);
															+		unsigned int next = forward_references[start];
															+
															+		/* skip chain members with another secondary code */
															+		while (next != 0 && hash2(&TokenArray[next]) != wanted) {
															+			next = forward_references[next];
															+		}
															+
															+		/* refer directly to the survivor, or to nothing */
															+		forward_references[start] = next;
															+	}
															+
															+#ifdef	DB_FORW_REF
															+	db_forward_references("second hashing");
															+#endif	/* DB_FORW_REF */
															+}
														
 
															+
														
 
															+/*	Secondary hash code of the MinRunSize tokens starting at p:
															+	the first token shifted left by 8 bits, plus the last token.
															+*/
															+static int
															+hash2(const TOKEN *p)
															+{
															+	return TOKEN2int(p[MinRunSize-1]) + (TOKEN2int(p[0]) << 8);
															+}
														
 
															+
														
 
															+#ifdef	DB_FORW_REF
														
 
															+
														
 
															+static int hash3(const TOKEN *, const TOKEN *);
														
 
															+
														
 
															+/*	Third sweep, a check on the previous two: for every position
															+	i with i+MinRunSize below TextLength(), walks the forward
															+	reference chain of i until hash3() reports a position whose
															+	tokens really equal those at i, and makes that position (or 0)
															+	the forward reference of i.  Prints the result through
															+	db_forward_references().
															+*/
															+static void
															+make_forward_references_hash3(void)
															+{
															+	unsigned int start;
															+
															+	for (start = 0; start+MinRunSize < TextLength(); start++) {
															+		unsigned int next = forward_references[start];
															+
															+		/* skip chain members that do not genuinely match */
															+		while (	next != 0
															+		&&	!hash3(&TokenArray[start], &TokenArray[next])
															+		) {
															+			next = forward_references[next];
															+		}
															+
															+		/* refer directly to the survivor, or to nothing */
															+		forward_references[start] = next;
															+	}
															+
															+	db_forward_references("third hashing");
															+}
														
 
															+
														
 
															+static int
														
 
															+hash3(const TOKEN *p, const TOKEN *q) {
														
 
															+	/* a full comparison for the tertiary sweep: 1 if the MinRunSize tokens at p and at q are pairwise equal, 0 otherwise */
														
 
															+	int n;
														
 
															+	
														
 
															+	for (n = 0; n < MinRunSize; n++) {
														
 
															+		if (TOKEN2int(*(p+n)) != TOKEN2int(*(q+n))) return 0;
														
 
															+	}
														
 
															+	return 1;
														
 
															+}
														
 
															+
														
 
															+static int
														
 
															+db_frw_chain(int n, char *crossed_out) {
														
 
															+	register int chain_len = -1;
														
 
															+		/* if there are two values, the chain length is still 1 */
														
 
															+	register int fw;
														
 
															+
														
 
															+	for (fw = n; fw; fw = forward_references[fw]) {
														
 
															+		if (crossed_out[fw]) {
														
 
															+			fprintf(DebugFile,
														
 
															+				">>>> error in forward_references[] <<<<\n"
														
 
															+			);
														
 
															+		}
														
 
															+		chain_len++;
														
 
															+		crossed_out[fw]++;
														
 
															+	}
														
 
															+	fprintf(DebugFile, "n = %d, chain_len = %d\n", n, chain_len);
														
 
															+	
														
 
															+	return chain_len;
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+db_forward_references(const char *msg) {
														
 
															+	int n;
														
 
															+	int n_frw_chains = 0;		/* number of forward ref. chains */
														
 
															+	int tot_frwc_len = 0;
														
 
															+	char *crossed_out;
														
 
															+
														
 
															+	fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
														
 
															+	fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
														
 
															+	fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
														
 
															+
														
 
															+	crossed_out = (char *)calloc(TextLength(), sizeof (char));
														
 
															+	if (!crossed_out) {
														
 
															+		fatal(">>>> no room for db_forward_references debug table <<<<\n");
														
 
															+	}
														
 
															+
														
 
															+	/*	Each forward_references[n] starts in principle a new
														
 
															+		chain, and these chains never touch each other.
														
 
															+		We check this property by marking the positions in each
														
 
															+		chain in an array; if we meet a marked entry while
														
 
															+		following a chain, it must have been on an earlier chain
														
 
															+		and we have an error.
														
 
															+		We also determine the lengths of the chains, for statistics.
														
 
															+	*/
														
 
															+	if (forward_references[0]) {
														
 
															+		fprintf(DebugFile,
														
 
															+			">>>> forward_references[0] is not zero <<<<\n"
														
 
															+		);
														
 
															+	}
														
 
															+	for (n = 1; n < TextLength(); n++) {
														
 
															+		if (forward_references[n] && !crossed_out[n]) {
														
 
															+			/* start of a new chain */
														
 
															+			n_frw_chains++;
														
 
															+			tot_frwc_len += db_frw_chain(n, crossed_out);
														
 
															+		}
														
 
															+	}
														
 
															+	free((char *)crossed_out);
														
 
															+
														
 
															+	fprintf(DebugFile,
														
 
															+		"text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
														
 
															+		TextLength(), n_frw_chains, tot_frwc_len
														
 
															+	);
														
 
															+}
														
 
															+
														
 
															+#endif	/* DB_FORW_REF */
														
--- a/utils/sim_pasc/hash.h
+++ b/utils/sim_pasc/hash.h
@@ -0,0 +1,12 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: hash.h,v 1.1 1997/06/20 12:03:14 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Creating and consulting the ForwardReference array; to speed up
														
 
															+	the Longest Substring Algorithm.
														
 
															+*/
														
 
															+
														
 
															+extern void MakeForwardReferences(void);
														
 
															+extern void FreeForwardReferences(void);
														
 
															+extern unsigned int ForwardReference(int i);
														
--- a/utils/sim_pasc/idf.c
+++ b/utils/sim_pasc/idf.c
@@ -0,0 +1,67 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: idf.c,v 2.8 2005/02/20 17:03:00 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<string.h>
														
 
															+
														
 
															+#include	"system.par"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+
														
 
															+TOKEN
														
 
															+idf_in_list(
														
 
															+	const char *str,
														
 
															+	const struct idf list[],
														
 
															+	unsigned int listsize,
														
 
															+	TOKEN dflt
														
 
															+) {
														
 
															+	/*	Binary search for str in the keyword table list[]; the
														
 
															+		table must be sorted on id_tag in strcmp() order, since
														
 
															+		that is the order the search relies on.
														
 
															+		listsize is the size of the table in bytes (callers pass
														
 
															+		sizeof list), not the number of entries.
														
 
															+		Returns the id_tr of the matching entry, or dflt if str
														
 
															+		is not in the table or the table is empty.
														
 
															+	*/
														
 
															+	register int first = 0;
														
 
															+	register int last = (int)(listsize / sizeof (struct idf)) - 1;
														
 
															+
														
 
															+	if (last < 0) {
														
 
															+		/* empty table: there is no list[0] to compare against */
														
 
															+		return dflt;
														
 
															+	}
														
 
															+	while (first < last) {
														
 
															+		register int middle = (first + last) / 2;
														
 
															+
														
 
															+		if (strcmp(str, list[middle].id_tag) > 0) {
														
 
															+			/* str sorts after list[middle]; drop the lower half */
														
 
															+			first = middle + 1;
														
 
															+		}
														
 
															+		else {
														
 
															+			/* str is at or before list[middle]; keep middle */
														
 
															+			last = middle;
														
 
															+		}
														
 
															+	}
														
 
															+	/* first == last: the only candidate left */
														
 
															+	return (strcmp(str, list[first].id_tag) == 0
														
 
															+	?	list[first].id_tr
														
 
															+	:	dflt
														
 
															+	);
														
 
															+}
														
 
															+
														
 
															+TOKEN
														
 
															+idf_hashed(const char *str) {
														
 
															+	/*	Hashes str to a token in the range 1..253.
														
 
															+		The shift-and-add step is done in unsigned long arithmetic,
														
 
															+		masked to 32 bits, because overflowing a signed int32 or
														
 
															+		left-shifting a negative one is undefined behaviour in C.
														
 
															+		The values produced are those of 32-bit two's-complement
														
 
															+		wrap-around.
														
 
															+	*/
														
 
															+	register int32 h = 0;
														
 
															+
														
 
															+	/* let's be careful about ranges; if done wrong it's hard to debug */
														
 
															+	while (*str) {
														
 
															+		register unsigned long u;
														
 
															+
														
 
															+		/* -1 <= h <= 2^31-1 */
														
 
															+		u = (((unsigned long)h << 1) + (*str++&0377)) & 0xFFFFFFFFUL;
														
 
															+		/* 0 <= u <= 2^32-1; bit 31 set means negative as a 32-bit int */
														
 
															+		if (u & 0x80000000UL) {
														
 
															+			/* the 32-bit value is u - 2^32, in -2^31 .. -1 */
														
 
															+			/* adding 2^31-1 to it gives u - 2^31 - 1 */
														
 
															+			h = (int32)(u - 0x80000000UL) - 1;
														
 
															+			/* -1 <= h <= 2^31-2 */
														
 
															+		}
														
 
															+		else {
														
 
															+			/* 0 <= u <= 2^31-1 */
														
 
															+			h = (int32)u;
														
 
															+		}
														
 
															+		/* -1 <= h <= 2^31-1 */
														
 
															+	}
														
 
															+	/* -1 <= h <= 2^31-1 */
														
 
															+	if (h < 0) {
														
 
															+		/* h = -1 */
														
 
															+		/* a very small chance, but all the same */
														
 
															+		h = 0;
														
 
															+	}
														
 
															+	/* 0 <= h <= 2^31-1 */
														
 
															+	h %= 253;				/* 0 <= h < 253 */
														
 
															+	return NORM(h + 1);			/* 1 <= h < 254 */
														
 
															+	/* this avoids SKIP (0) and EOL (255) */
														
 
															+}
														
--- a/utils/sim_pasc/idf.h
+++ b/utils/sim_pasc/idf.h
@@ -0,0 +1,31 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: idf.h,v 2.5 1998/02/03 14:28:25 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Idf module:
														
 
															+	TOKEN idf_in_list(char *str, struct idf l[], sizeof l, TOKEN dflt);
														
 
															+		looks up a keyword in a list of keywords l, represented as an
														
 
															+		array of struct idf, and returns its translation as a token;
														
 
															+		dflt is returned if the keyword is not found.
														
 
															+	TOKEN idf_hashed(char *str);
														
 
															+		returns a token unequal to SKIP or EOL, derived from the str
														
 
															+		through hashing
														
 
															+	It is assumed that SKIP will be ignored by the user of this module.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+/* the struct for keywords etc. */
														
 
															+struct idf {
														
 
															+	char *id_tag;	/* an interesting identifier */
														
 
															+	TOKEN id_tr;	/* with its one-token translation */
														
 
															+};
														
 
															+
														
 
															+/* special tokens for the idf module */
														
 
															+#define	SKIP		NORM('\0')
														
 
															+#define	IDF		NORM('@')
														
 
															+
														
 
															+/* public functions */
														
 
															+extern TOKEN idf_in_list(const char *, const struct idf [], unsigned int, TOKEN);
														
 
															+extern TOKEN idf_hashed(const char *);
														
--- a/utils/sim_pasc/javalang.l
+++ b/utils/sim_pasc/javalang.l
@@ -0,0 +1,270 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: javalang.l,v 1.4 2007/08/29 09:10:32 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Java language front end for the similarity tester.
														
 
															+	Author:	Dick Grune <[email protected]>
														
 
															+*/
														
 
															+
														
 
															+#include	"options.h"
														
 
															+#include	"algollike.h"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+
														
 
															+static const struct idf reserved[] = {
														
 
															+	{"abstract",	NORM('a')},
														
 
															+	{"boolean",	NORM('b')},
														
 
															+	{"break",	NORM('B')},
														
 
															+	{"byte",	CTRL('B')},
														
 
															+	{"case",	NORM('c')},
														
 
															+	{"catch",	NORM('C')},
														
 
															+	{"char",	CTRL('C')},
														
 
															+	{"class",	META('c')},
														
 
															+	{"continue",	META('C')},
														
 
															+	{"default",	NORM('d')},
														
 
															+	{"do",		NORM('D')},
														
 
															+	{"double",	CTRL('D')},
														
 
															+	{"else",	NORM('e')},
														
 
															+	{"extends",	NORM('E')},
														
 
															+	{"false",	NORM('g')},	/* Boolean literal */
														
 
															+	{"final",	NORM('f')},
														
 
															+	{"finally",	NORM('F')},
														
 
															+	{"float",	CTRL('F')},
														
 
															+	{"for",		META('f')},
														
 
															+	{"if",		NORM('i')},
														
 
															+	{"implements",	NORM('I')},
														
 
															+	{"import",	CTRL('I')},
														
 
															+	{"instanceof",	META('i')},
														
 
															+	{"int",		META('I')},
														
 
															+	{"interface",	MTCT('I')},
														
 
															+	{"long",	NORM('l')},
														
 
															+	{"native",	NORM('n')},
														
 
															+	{"new",		NORM('N')},
														
 
															+	{"null",	CTRL('N')},	/* null literal */
														
 
															+	{"package",	NORM('p')},
														
 
															+	{"private",	NORM('P')},
														
 
															+	{"protected",	CTRL('P')},
														
 
															+	{"public",	META('p')},
														
 
															+	{"return",	NORM('r')},
														
 
															+	{"short",	NORM('s')},
														
 
															+	{"static",	NORM('S')},
														
 
															+	{"super",	CTRL('S')},
														
 
															+	{"switch",	META('s')},
														
 
															+	{"synchronized",META('S')},
														
 
															+	{"this",	NORM('t')},
														
 
															+	{"throw",	NORM('T')},
														
 
															+	{"throws",	CTRL('T')},
														
 
															+	{"true",	META('t')},	/* Boolean literal */
														
 
															+	{"void",	NORM('v')},
														
 
															+	{"volatile",	NORM('V')},
														
 
															+	{"while",	NORM('w')}
														
 
															+};
														
 
															+
														
 
															+/* Special treatment of identifiers */
														
 
															+
														
 
															+static TOKEN
														
 
															+idf2token(int hashing) {
														
 
															+	register TOKEN tk;
														
 
															+
														
 
															+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
														
 
															+	if (TOKEN_EQ(tk, IDF) && hashing) {
														
 
															+		/* return a one-token hash code */
														
 
															+		tk = idf_hashed(yytext);
														
 
															+	}
														
 
															+	return tk;
														
 
															+}
														
 
															+
														
 
															+/* Token sets for module algollike */
														
 
															+const TOKEN NonFinals[] = {
														
 
															+	IDF,		/* identifier */
														
 
															+	NORM('{'),
														
 
															+	NORM('('),
														
 
															+	NORM('a'),	/* abstract */
														
 
															+	NORM('b'),	/* boolean */
														
 
															+	NORM('B'),	/* break */
														
 
															+	CTRL('B'),	/* byte */
														
 
															+	NORM('c'),	/* case */
														
 
															+	NORM('C'),	/* catch */
														
 
															+	CTRL('C'),	/* char */
														
 
															+	META('c'),	/* class */
														
 
															+	META('C'),	/* continue */
														
 
															+	NORM('d'),	/* default */
														
 
															+	NORM('D'),	/* do */
														
 
															+	CTRL('D'),	/* double */
														
 
															+	NORM('e'),	/* else */
														
 
															+	NORM('E'),	/* extends */
														
 
															+	NORM('f'),	/* final */
														
 
															+	NORM('F'),	/* finally */
														
 
															+	CTRL('F'),	/* float */
														
 
															+	META('f'),	/* for */
														
 
															+	NORM('i'),	/* if */
														
 
															+	NORM('I'),	/* implements */
														
 
															+	CTRL('I'),	/* import */
														
 
															+	META('i'),	/* instanceof */
														
 
															+	META('I'),	/* int */
														
 
															+	MTCT('I'),	/* interface */
														
 
															+	NORM('l'),	/* long */
														
 
															+	NORM('n'),	/* native */
														
 
															+	NORM('N'),	/* new */
														
 
															+	NORM('p'),	/* package */
														
 
															+	NORM('P'),	/* private */
														
 
															+	CTRL('P'),	/* protected */
														
 
															+	META('p'),	/* public */
														
 
															+	NORM('r'),	/* return */
														
 
															+	NORM('s'),	/* short */
														
 
															+	NORM('S'),	/* static */
														
 
															+	CTRL('S'),	/* super */
														
 
															+	META('s'),	/* switch */
														
 
															+	META('S'),	/* synchronized */
														
 
															+	NORM('T'),	/* throw */
														
 
															+	CTRL('T'),	/* throws */
														
 
															+	NORM('v'),	/* void */
														
 
															+	NORM('V'),	/* volatile */
														
 
															+	NORM('w'),	/* while */
														
 
															+	NOTOKEN
														
 
															+};
														
 
															+const TOKEN NonInitials[] = {
														
 
															+	NORM(')'),
														
 
															+	NORM('}'),
														
 
															+	NORM(';'),
														
 
															+	NOTOKEN
														
 
															+};
														
 
															+const TOKEN Openers[] = {
														
 
															+	NORM('{'),
														
 
															+	NORM('('),
														
 
															+	NORM('['),
														
 
															+	NOTOKEN
														
 
															+};
														
 
															+const TOKEN Closers[] = {
														
 
															+	NORM('}'),
														
 
															+	NORM(')'),
														
 
															+	NORM(']'),
														
 
															+	NOTOKEN
														
 
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
														
 
															+%option never-interactive
														
 
															+
														
 
															+%Start	Comment
														
 
															+
														
 
															+Layout		([ \t\r\f])
														
 
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
														
 
															+
														
 
															+Digit		([0-9a-fA-F])
														
 
															+
														
 
															+UniCode		(\\u{Digit}{Digit}{Digit}{Digit})
														
 
															+AnyQuoted	((\\.)|{UniCode})
														
 
															+StrChar		([^"\n\\]|{AnyQuoted})
														
 
															+ChrChar		([^'\n\\]|{AnyQuoted})
														
 
															+
														
 
															+StartComment	("/*")
														
 
															+EndComment	("*/")
														
 
															+SafeComChar	([^*\n])
														
 
															+UnsafeComChar	("*")
														
 
															+
														
 
															+SingleLineCom	("//".*)
														
 
															+
														
 
															+Idf		([A-Za-z][A-Za-z0-9_]*)
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+{StartComment}	{
														
 
															+		/*	We do not have one single pattern to match a comment
														
 
															+			(although one can be written), for two reasons.
														
 
															+			The matched string might overflow lex-internal buffers
														
 
															+			like yysbuf and yytext; and the pattern would be very
														
 
															+			complicated and overtax lex.
														
 
															+			So we break up the string into safe chunks and keep
														
 
															+			track of where we are in a start condition <Comment>.
														
 
															+		*/
														
 
															+		BEGIN Comment;
														
 
															+	}
														
 
															+
														
 
															+<Comment>{SafeComChar}+	{		/* safe comment chunk */
														
 
															+	}
														
 
															+
														
 
															+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
														
 
															+	}
														
 
															+
														
 
															+<Comment>"\n"		{		/* to break up long comments */
														
 
															+		return_eol();
														
 
															+	}
														
 
															+
														
 
															+<Comment>{EndComment}	{		/* end-of-comment */
														
 
															+		BEGIN INITIAL;
														
 
															+	}
														
 
															+
														
 
															+{SingleLineCom}"\n"	{		/* single-line comment */
														
 
															+		return_eol();
														
 
															+	}
														
 
															+
														
 
															+\"{StrChar}*\"	{			/* strings */
														
 
															+		return_ch('"');
														
 
															+	}
														
 
															+
														
 
															+\'{ChrChar}+\'	{			/* characters */
														
 
															+		return_ch('\'');
														
 
															+	}
														
 
															+
														
 
															+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
														
 
															+		return_tk(IDF);
														
 
															+	}
														
 
															+
														
 
															+"import"{Layout}[^;]*;	{		/* import statement; ignore */
														
 
															+	}
														
 
															+
														
 
															+{Idf}/"("	{			/* identifier in front of ( */
														
 
															+		register TOKEN tk;
														
 
															+
														
 
															+		tk = idf2token(option_set('F'));
														
 
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
														
 
															+	}
														
 
															+
														
 
															+{Idf}	{				/* identifier */
														
 
															+		register TOKEN tk;
														
 
															+
														
 
															+		tk = idf2token(0 /* no hashing */);
														
 
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
														
 
															+	}
														
 
															+
														
 
															+\;	{				/* semicolon, conditionally ignored */
														
 
															+		if (option_set('f')) return_ch(yytext[0]);
														
 
															+	}
														
 
															+
														
 
															+\n	{				/* count newlines */
														
 
															+		return_eol();
														
 
															+	}
														
 
															+
														
 
															+{Layout}	{			/* ignore layout */
														
 
															+	}
														
 
															+
														
 
															+{ASCII95}	{			/* copy other text */
														
 
															+		return_ch(yytext[0]);
														
 
															+	}
														
 
															+
														
 
															+.	{				/* count non-ASCII chars */
														
 
															+		lex_non_ascii_cnt++;
														
 
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+void
														
 
															+yystart(void) {
														
 
															+	BEGIN INITIAL;
														
 
															+}
														
 
															+
														
 
															+int
														
 
															+yywrap(void) {
														
 
															+	return 1;
														
 
															+}
														
--- a/utils/sim_pasc/lang.h
+++ b/utils/sim_pasc/lang.h
@@ -0,0 +1,32 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: lang.h,v 1.2 1998/01/21 14:27:51 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	The token-providing module 'lang' has three interfaces:
														
 
															+	-	lang.h, which provides access to the lowest-level token
														
 
															+			routines, to be used by the next level.
														
 
															+	-	lex.h, which provides the lex variables, to be used by
														
 
															+			all and sundry.
														
 
															+	-	language.h, which provides language-specific info about
														
 
															+			tokens, concerning their suitability as initial
														
 
															+			and final tokens, to be used by higher levels.
														
 
															+			
														
 
															+	This structure is not satisfactory, but it is also unreasonable
														
 
															+	to combine them in one interface.
														
 
															+
														
 
															+	There is no single lang.c; rather it is represented by the
														
 
															+	various Xlang.c files generated from the Xlang.l files.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+/* useful macros */
														
 
															+#define	return_tk(tk)	{lex_tk_cnt++; lex_token = (tk); return 1;}
														
 
															+#define	return_ch(ch)	{lex_tk_cnt++; lex_token = int2TOKEN((int)(ch)); return 1;}
														
 
															+#define	return_eol()	{lex_nl_cnt++; lex_token = EOL; return 1;}
														
 
															+
														
 
															+extern int yylex(void);
														
 
															+extern void yystart(void);
														
 
															+extern FILE *yyin;
														
--- a/utils/sim_pasc/language.h
+++ b/utils/sim_pasc/language.h
@@ -0,0 +1,17 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: language.h,v 1.1 1997/06/20 12:03:15 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	The abstract class Language contains the routines InitLanguage,
														
 
															+	MayBeStartOfRun and CheckRun which describe in some sense the
														
 
															+	language and which are required by compare.c.
														
 
															+	
														
 
															+	These routines must be provided by all Xlang.l files.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+extern void InitLanguage(void);
														
 
															+extern int MayBeStartOfRun(TOKEN ch);
														
 
															+extern unsigned int CheckRun(const TOKEN *str, unsigned int size);
														
--- a/utils/sim_pasc/lex.c
+++ b/utils/sim_pasc/lex.c
@@ -0,0 +1,16 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: lex.c,v 1.3 1998/02/03 14:28:26 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	The communication variables, as set by yylex, NextStreamTokenObtained
														
 
															+	and NextTextTokenObtained.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+#include	"lex.h"
														
 
															+
														
 
															+TOKEN lex_token;			/* token produced, or EOL */
														
 
															+unsigned int lex_nl_cnt;		/* line count */
														
 
															+unsigned int lex_tk_cnt;		/* token position */
														
 
															+unsigned int lex_non_ascii_cnt;		/* # of non-ASCII chars found */
														
--- a/utils/sim_pasc/lex.h
+++ b/utils/sim_pasc/lex.h
@@ -0,0 +1,19 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: lex.h,v 2.5 1998/02/03 14:28:27 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Since the lex_X variables are hoisted unchanged through the levels
														
 
															+	lang, stream, and buff, to be used by pass1, pass2, etc., they
														
 
															+	have to be placed in a module of their own.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+/* special tokens */
														
 
															+#define	EOL		NORM(0377)	/* end of line; a possible value of lex_token */
															+
															+extern TOKEN lex_token;			/* token produced, or EOL; defined in lex.c */
															+extern unsigned int lex_nl_cnt;		/* line count; defined in lex.c */
															+extern unsigned int lex_tk_cnt;		/* token position; defined in lex.c */
															+extern unsigned int lex_non_ascii_cnt;	/* # of non-ASCII chars found; defined in lex.c */
														
--- a/utils/sim_pasc/lisplang.l
+++ b/utils/sim_pasc/lisplang.l
@@ -0,0 +1,123 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: lisplang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	LISP language front end for the similarity tester.
														
 
															+	Author:	Gertjan Akkerman <[email protected]>
														
 
															+	Date:	Thu, 9 Apr 87 11:15:23 MDT
														
 
															+*/
														
 
															+
														
 
															+#include	"language.h"
														
 
															+#include	"token.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+#include	"idf.h"
														
 
															+
														
 
															+static const struct idf reserved[] = {	/* searched with idf_in_list() by the identifier rule */
															+	{"append",	NORM('a')},
															+	{"append1",	NORM('b')},
															+	{"atom",	NORM('t')},		/* NOTE(review): same token as "cdr" below -- confirm this is intended */
															+	{"car",		NORM('h')},
															+	{"cdr",		NORM('t')},
															+	{"cond",	NORM('c')},
															+	{"cons",	NORM('s')},
															+	{"defun",	NORM('u')},
															+	{"do",		NORM('d')},
															+	{"eq",		NORM('e')},
															+	{"equal",	NORM('e')},		/* See eq: deliberately the same token */
															+	{"for",		NORM('f')},
															+	{"if",		NORM('i')},
															+	{"list",	NORM('l')},
															+	{"nconc",	NORM('n')},
															+	{"rplaca",	NORM('A')},
															+	{"rplacd",	NORM('D')}
															+};
														
 
															+
														
 
															+/* Token sets for module algollike */
														
 
															+const TOKEN NonFinals[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	NORM('('),
															+	NORM('['),
															+	NOTOKEN
															+};
															+const TOKEN NonInitials[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
															+const TOKEN Openers[] = {	/* opening brackets; same members as NonFinals here */
															+	NORM('('),
															+	NORM('['),
															+	NOTOKEN
															+};
															+const TOKEN Closers[] = {	/* closing brackets; same members as NonInitials here */
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
															+%option never-interactive
															+
															+%Start	Comment
															+
															+Layout		([ \t\r\f])
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
															+
															+AnyQuoted	(\\.)
															+StrChar		([^"\n\\]|{AnyQuoted})
															+ChrChar		([^'\\]|{AnyQuoted})
															+
															+IdfChar		([-!#$%&*+,/0-9:;<=>?@A-Z\\^_`a-z{}~])
															+
															+EscIdf		(({IdfChar}|\\.)+)
															+QuotIdf		("|"[^\|\n]*"|")
															+Idf		({EscIdf}|{QuotIdf})
															+
															+%%
															+
															+";".*	{				/* comment */
															+		/*	No trailing-context $ here: '.' never matches a
															+			newline, so the match stops at the end of the line
															+			anyway, and without the $ a comment on a last line
															+			that lacks a final newline is still recognised.
															+		*/
															+	}
															+
															+\"{StrChar}*\"	{			/* strings */
															+		/* every string literal yields the same token */
															+		return_ch('"');
															+	}
															+
															+{Idf}	{				/* identifier */
															+		/* a reserved word yields its own token, anything else IDF */
															+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
															+	}
															+
															+\n	{				/* count newlines */
															+		return_eol();
															+	}
															+
															+{Layout}	{			/* ignore layout */
															+	}
															+
															+{ASCII95}	{			/* copy other text */
															+		return_ch(yytext[0]);
															+	}
															+
															+.	{				/* count non-ASCII chars */
															+		lex_non_ascii_cnt++;
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+/* Puts the scanner back into its INITIAL start condition. */
															+void yystart(void)
															+{
															+	BEGIN(INITIAL);
															+}
														
 
															+
														
 
															+/* Always answers 1 when the scanner reaches end of input. */
															+int yywrap(void)
															+{
															+	return 1;
															+}
														
--- a/utils/sim_pasc/m2lang.l
+++ b/utils/sim_pasc/m2lang.l
@@ -0,0 +1,319 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: m2lang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Modula-2 language front end for the similarity tester.
														
 
															+	Author:	Dick Grune <[email protected]>
														
 
															+*/
														
 
															+
														
 
															+#include	"options.h"
														
 
															+#include	"algollike.h"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+
														
 
															+/*	Most Modula-2 programs start with a number of IMPORTs that look
														
 
															+	very similar from program to program.  These are skipped by ignoring
														
 
															+	the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT
														
 
															+	and FROM, having a flag skip_imports, and start reacting only
														
 
															+	at the first non-ignored reserved word.
														
 
															+
														
 
															+	Also, the nesting comments require a state variable.
														
 
															+*/
														
 
															+
														
 
															+/* Additional state variables, set in yystart() */
														
 
															+static int skip_imports;	/* set to 1 by yystart(); cleared by idf2token() at the first reserved word that is not SKIP */
															+static int comment_level;	/* nesting depth of (* ... *) comments; 0 outside any comment */
														
 
															+
														
 
															+/* Data for module idf */
														
 
															+
														
 
															+static const struct idf reserved[] = {	/* reserved words; the identifier rules return no token for SKIP entries */
															+	{"AND",		NORM('&')},
															+	{"ARRAY",	NORM('A')},
															+	{"BEGIN",	NORM('{')},
															+	{"BY",		NORM('B')},
															+	{"CASE",	NORM('c')},
															+	{"CONST",	NORM('C')},
															+	{"DEFINITION",	SKIP},
															+	{"DIV",		NORM('/')},
															+	{"DO",		NORM('D')},
															+	{"ELSE",	NORM('e')},
															+	{"ELSIF",	NORM('e')},	/* same token as ELSE */
															+	{"END",		NORM('}')},
															+	{"EXIT",	NORM('E')},
															+	{"EXPORT",	CTRL('E')},
															+	{"FOR",		NORM('F')},
															+	{"FROM",	SKIP},
															+	{"IF",		NORM('i')},
															+	{"IMPLEMENTATION", SKIP},
															+	{"IMPORT",	SKIP},
															+	{"IN",		NORM('I')},
															+	{"LOOP",	NORM('l')},
															+	{"MOD",		NORM('%')},
															+	{"MODULE",	SKIP},
															+	{"NOT",		NORM('~')},
															+	{"OF",		SKIP},
															+	{"OR",		NORM('O')},
															+	{"POINTER",	NORM('p')},
															+	{"PROCEDURE",	NORM('P')},
															+	{"QUALIFIED",	NORM('q')},
															+	{"RECORD",	NORM('r')},
															+	{"REPEAT",	NORM('R')},
															+	{"RETURN",	CTRL('r')},	/* NonFinals spells this CTRL('R') -- NOTE(review): confirm CTRL() maps both to one token */
															+	{"SET",		NORM('s')},
															+	{"THEN",	SKIP},
															+	{"TO",		NORM('t')},
															+	{"TYPE",	NORM('T')},
															+	{"UNTIL",	NORM('u')},
															+	{"VAR",		NORM('v')},
															+	{"WHILE",	NORM('w')},
															+	{"WITH",	NORM('W')},
															+};
														
 
															+
														
 
															+static const struct idf standard[] = {	/* standard identifiers; idf2token() consults this list only for words not found in reserved[] */
															+	{"ABS",		META('a')},
															+	{"ADDRESS",	META('A')},
															+	{"ALLOCATE",	MTCT('A')},
															+	{"BITSET",	META('b')},
															+	{"BOOLEAN",	META('B')},
															+	{"CAP",		META('c')},
															+	{"CARDINAL",	META('C')},
															+	{"CHAR",	MTCT('C')},
															+	{"CHR",		META('x')},
															+	{"DEALLOCATE",	META('d')},
															+	{"DEC",		META('D')},
															+	{"EXCL",	META('e')},
															+	{"FALSE",	META('f')},
															+	{"FLOAT",	META('F')},
															+	{"HALT",	META('h')},
															+	{"HIGH",	META('H')},
															+	{"INC",		META('i')},
															+	{"INCL",	META('I')},
															+	{"INTEGER",	MTCT('I')},
															+	{"LONGCARD",	META('L')},	/* the three LONG... types share one token */
															+	{"LONGINT",	META('L')},
															+	{"LONGREAL",	META('L')},
															+	{"MAX",		META('m')},
															+	{"MIN",		META('M')},
															+	{"NEWPROCESS",	META('n')},
															+	{"NIL",		META('N')},
															+	{"ODD",		META('o')},
															+	{"ORD",		META('O')},
															+	{"PROC",	META('p')},
															+	{"REAL",	META('r')},
															+	{"SIZE",	META('s')},
															+	{"SYSTEM",	META('S')},
															+	{"TRANSFER",	META('t')},
															+	{"TRUE",	META('T')},
															+	{"TRUNC",	MTCT('T')},
															+	{"VAL",		META('v')},
															+	{"WORD",	META('w')}
															+};
														
 
															+
														
 
															+/* Special treatment of identifiers */
														
 
															+
														
 
															+/*	Maps the identifier in yytext to a token.
															+
															+	Returns SKIP for a keyword that is to be ignored, and also for any
															+	non-reserved identifier seen while skip_imports is still set.
															+	The first reserved word that is not SKIP clears skip_imports and
															+	is returned as is.  Any other identifier is looked up in the list
															+	of standard identifiers; if it is not there the result is IDF or,
															+	when 'hashing' is non-zero, the value of idf_hashed(yytext).
															+*/
															+static TOKEN
															+idf2token(int hashing) {
															+	TOKEN tk;
															+
															+	/* the token can be on two lists, reserved and standard */
															+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
															+
															+	/* is it one of the keywords to be ignored? */
															+	if (TOKEN_EQ(tk, SKIP)) return tk;
															+
															+	if (!TOKEN_EQ(tk, IDF)) {
															+		/* reserved word, stop the skipping */
															+		skip_imports = 0;
															+		return tk;
															+	}
															+
															+	/* it is an identifier but not a reserved word */
															+	if (skip_imports) {
															+		/*	Skip it.  SKIP is named explicitly, rather than
															+			a literal 0, because the callers test the result
															+			with TOKEN_EQ(tk, SKIP).
															+		*/
															+		return SKIP;
															+	}
															+
															+	/* look further */
															+	tk = idf_in_list(yytext, standard, sizeof standard, IDF);
															+	if (TOKEN_EQ(tk, IDF) && hashing) {
															+		/* return a one-token hash code */
															+		tk = idf_hashed(yytext);
															+	}
															+	return tk;
															+}
														
 
															+
														
 
															+/* Token sets for module algollike */
														
 
															+const TOKEN NonFinals[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	IDF,		/* identifier */
															+	NORM('{'),	/* also BEGIN */
															+	NORM('('),
															+	NORM('['),
															+	NORM('A'),	/* ARRAY */
															+	NORM('c'),	/* CASE */
															+	NORM('C'),	/* CONST */
															+	NORM('E'),	/* EXIT */
															+	NORM('F'),	/* FOR */
															+	NORM('i'),	/* IF */
															+	NORM('l'),	/* LOOP */
															+	NORM('p'),	/* POINTER */
															+	NORM('P'),	/* PROCEDURE */
															+	NORM('r'),	/* RECORD */
															+	NORM('R'),	/* REPEAT */
															+	CTRL('R'),	/* RETURN; reserved[] spells it CTRL('r') -- NOTE(review): confirm both give the same token */
															+	NORM('s'),	/* SET */
															+	NORM('T'),	/* TYPE */
															+	NORM('v'),	/* VAR */
															+	NORM('w'),	/* WHILE */
															+	NORM('W'),	/* WITH */
															+	NOTOKEN
															+};
															+const TOKEN NonInitials[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	NORM('}'),	/* also END, see reserved[] */
															+	NORM(')'),
															+	NORM(']'),
															+	NORM(';'),
															+	NOTOKEN
															+};
															+const TOKEN Openers[] = {	/* opening brackets, BEGIN included as '{' */
															+	NORM('{'),
															+	NORM('('),
															+	NORM('['),
															+	NOTOKEN
															+};
															+const TOKEN Closers[] = {	/* closing brackets, END included as '}' */
															+	NORM('}'),
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
															+%option never-interactive
															+
															+%x	Comment
															+
															+Layout		([ \t\r\f])
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
															+
															+AnyQuoted	(\\.)
															+QuStrChar	([^"\n\\]|{AnyQuoted})
															+ApoStrChar	([^'\n\\]|{AnyQuoted})
															+
															+StartComment	("(*")
															+EndComment	("*)")
															+SafeComChar	([^*(\n])
															+UnsafeComChar	([*(])
															+
															+Digit		([0-9a-fA-F])
															+Idf		([A-Za-z][A-Za-z0-9_]*)
															+
															+%%
															+
															+<INITIAL,Comment>{StartComment}	{	/* See clang.l */
															+		/*	Lex itself is incapable of handling Modula-2's
															+			nested comments. So let's help it a bit.
															+
															+			Comment is an exclusive start condition, so none
															+			of the ordinary rules below can fire inside a
															+			comment.  "(" is kept out of SafeComChar so that
															+			a nested "(*" is seen wherever it occurs, not
															+			only at the start of a chunk.
															+		*/
															+		if (comment_level == 0) {
															+			BEGIN Comment;
															+		}
															+		comment_level++;
															+	}
															+
															+<Comment>{SafeComChar}+	{		/* safe comment chunk */
															+	}
															+
															+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
															+		/* a "*" or "(" that starts neither "*)" nor "(*" */
															+	}
															+
															+<Comment>"\n"		{		/* to break up long comments */
															+		return_eol();
															+	}
															+
															+<Comment>{EndComment}	{		/* end-of-comment */
															+		comment_level--;
															+		if (comment_level == 0) {
															+			BEGIN INITIAL;
															+		}
															+	}
															+
															+\"{QuStrChar}*\"	{		/* quoted strings */
															+		return_ch('"');
															+	}
															+
															+\'{ApoStrChar}*\'	{		/* apostrophed strings */
															+		/* same token as for quoted strings */
															+		return_ch('"');
															+	}
															+
															+[0-9]{Digit}*("B"|"C"|"H")?	{	/* numeral, passed as an identifier */
															+		/*	A numeral must start with a decimal digit;
															+			otherwise names consisting of hex letters only
															+			(DEC, a, Bad, ...) would be taken for numerals
															+			and never reach the identifier rules.
															+		*/
															+		return_tk(IDF);
															+	}
															+
															+"END"{Layout}+{Idf}	{		/* ignore identifier after END */
															+		/*	Layout is mandatory, so that an identifier that
															+			merely starts with END (e.g. ENDPOINT) is not
															+			mistaken for END followed by a name.
															+		*/
															+		return_tk(idf_in_list("END", reserved, sizeof reserved, SKIP));
															+	}
															+
															+{Idf}/"("	{			/* identifier in front of ( */
															+		TOKEN tk;
															+
															+		/* hashed only when option F is set */
															+		tk = idf2token(option_set('F'));
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
															+	}
															+
															+{Idf}	{				/* identifier */
															+		TOKEN tk;
															+
															+		tk = idf2token(0 /* no hashing */);
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
															+	}
															+
															+"<>"	{				/* <>, special equivalence */
															+		return_ch('#');
															+	}
															+
															+\;	{				/* semicolon, conditionally ignored */
															+		if (option_set('f')) return_ch(yytext[0]);
															+	}
															+
															+\n	{				/* count newlines */
															+		return_eol();
															+	}
															+
															+{Layout}	{			/* ignore layout */
															+	}
															+
															+{ASCII95}	{			/* copy other text */
															+		/* suppressed as long as the import section is skipped */
															+		if (!skip_imports) return_ch(yytext[0]);
															+	}
															+
															+.	{				/* count non-ASCII chars */
															+		lex_non_ascii_cnt++;
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+/*	Resets the scanner state: no comment open, import skipping
															+	switched on, start condition INITIAL.
															+*/
															+void yystart(void)
															+{
															+	comment_level = 0;
															+	skip_imports = 1;
															+	BEGIN(INITIAL);
															+}
														
 
															+
														
 
															+/* Always answers 1 when the scanner reaches end of input. */
															+int yywrap(void)
															+{
															+	return 1;
															+}
														
--- a/utils/sim_pasc/miralang.l
+++ b/utils/sim_pasc/miralang.l
@@ -0,0 +1,131 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: miralang.l,v 1.3 2007/08/29 09:10:34 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Miranda language front end for the similarity tester.
														
 
															+	Author:	Emma Norling ([email protected])
														
 
															+	Date:	Nov 1998
														
 
															+*/
														
 
															+
														
 
															+#include	"language.h"
														
 
															+#include	"token.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+#include	"idf.h"
														
 
															+
														
 
															+/*	Reserved words and predefined identifiers of Miranda.
															+
															+	The entries are listed in strcmp() order, in which the upper-case
															+	initials of "False" and "True" sort before every lower-case word;
															+	an ordered (e.g. binary) search cannot find an entry that is out
															+	of place.
															+	NOTE(review): idf_in_list() is defined outside this file; confirm
															+	that it relies on this ordering.
															+*/
															+static const struct idf reserved[] = {
															+	{"False",	NORM('F')},
															+	{"True",	NORM('T')},
															+	{"abstype",	NORM('a')},
															+	{"bool",	NORM('b')},
															+	{"char",	NORM('c')},
															+	{"const",	META('c')},
															+	{"div",		NORM('d')},
															+	{"if",		NORM('i')},
															+	{"mod",		NORM('m')},
															+	{"num",		NORM('n')},
															+	{"otherwise",	NORM('o')},
															+	{"readvals",	NORM('r')},
															+	{"show",	NORM('s')},
															+	{"sys_message",	META('s')},
															+	{"type",	NORM('t')},
															+	{"where",	NORM('w')},
															+	{"with",	META('w')}
															+};
														
 
															+
														
 
															+/* Token sets for module algollike */
														
 
															+const TOKEN NonFinals[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	NORM('('),
															+	NORM('['),
															+	NORM('='),
															+	NOTOKEN
															+};
															+const TOKEN NonInitials[] = {	/* token set for module algollike; the list ends with NOTOKEN */
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
															+const TOKEN Openers[] = {	/* same members as NonFinals here; '=' is listed without a counterpart in Closers */
															+	NORM('('),
															+	NORM('['),
															+	NORM('='),
															+	NOTOKEN
															+};
															+const TOKEN Closers[] = {	/* closing brackets; same members as NonInitials here */
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
															+%option never-interactive
															+
															+%Start	Comment
															+
															+Layout		([ \t\r\f])
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
															+
															+AnyQuoted	(\\.)
															+StrChar		([^"\n\\]|{AnyQuoted})
															+ChrChar		([^'\\]|{AnyQuoted})
															+
															+Idf		([A-Za-z][A-Za-z0-9_']*)
															+
															+%%
															+
															+"||".*	{				/* comment */
															+		/*	No trailing-context $ here: '.' never matches a
															+			newline, so the match stops at the end of the line
															+			anyway, and without the $ a comment on a last line
															+			that lacks a final newline is still recognised.
															+		*/
															+	}
															+
															+\"{StrChar}*\"	{			/* strings */
															+		/* every string literal yields the same token */
															+		return_ch('"');
															+	}
															+
															+\'{ChrChar}\'	{			/* characters */
															+		/* every character literal yields the same token */
															+		return_ch('\'');
															+	}
															+
															+\%{Layout}*include.*	{		/* skip %include line */
															+	}
															+
															+\%{Layout}*insert.*	{		/* skip %insert line */
															+	}
															+
															+{Idf}	{				/* identifier */
															+		/* a reserved word yields its own token, anything else IDF */
															+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
															+	}
															+
															+\n	{				/* count newlines */
															+		return_eol();
															+	}
															+
															+{Layout}	{			/* ignore layout */
															+	}
															+
															+{ASCII95}	{			/* copy other text */
															+		return_ch(yytext[0]);
															+	}
															+
															+.	{				/* count non-ASCII chars */
															+		lex_non_ascii_cnt++;
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+/* Puts the scanner back into its INITIAL start condition. */
															+void yystart(void)
															+{
															+	BEGIN(INITIAL);
															+}
														
 
															+
														
 
															+/* Always answers 1 when the scanner reaches end of input. */
															+int yywrap(void)
															+{
															+	return 1;
															+}
														
--- a/utils/sim_pasc/options.c
+++ b/utils/sim_pasc/options.c
@@ -0,0 +1,123 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: options.c,v 1.3 2001/11/13 12:55:53 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<stdlib.h>
														
 
															+
														
 
															+#include	"options.h"
														
 
															+
														
 
															+/*	Occurrence counters for the options, indexed by the option
															+	character itself; incremented in do_arg() and read back by
															+	option_set().
															+*/
															+static char options[128];
														
 
															+
														
 
															+static void bad_option(
														
 
															+	const char *progname, const struct option *optlist, char *msg, int c
														
 
															+);
														
 
															+static int opt_value(const struct option *op, const char *arg, char *argv[]);
														
 
															+
														
 
															+static int do_arg(
														
 
															+	const char *progname, const struct option *optlist,
														
 
															+	const char *arg, char *argv[]
														
 
															+);
														
 
															+
														
 
															+int
															+do_options(
															+	const char *progname, const struct option *optlist,
															+	int argc, char *argv[]
															+) {
															+	/*	Handles the leading option arguments of argv[0..argc-1]
															+		and returns the number of argv entries taken up by them.
															+		Scanning stops at the first argument that does not start
															+		with '-' or that consists of a '-' only.
															+	*/
															+	int total = 0;
															+
															+	for (;;) {
															+		int used;
															+
															+		if (argc <= 0) break;
															+		if (argv[0][0] != '-' || argv[0][1] == '\0') break;
															+
															+		/* hand the text after the '-' to do_arg() */
															+		used = do_arg(progname, optlist, argv[0] + 1, argv);
															+		argc -= used;
															+		argv += used;
															+		total += used;
															+	}
															+
															+	return total;
															+}
														
 
															+
														
 
															+int
															+option_set(char ch) {
															+	/*	Returns the number of times option -ch was counted by
															+		do_arg(), 0 if it was never seen.
															+		The index is taken as an unsigned char and checked
															+		against the table size, so that a character outside
															+		the table (negative where char is signed, or above
															+		127) yields 0 instead of reading outside options[].
															+	*/
															+	unsigned char idx = (unsigned char)ch;
															+
															+	if (idx >= sizeof options) return 0;
															+	return options[idx];
															+}
														
 
															+
														
 
															+static int
															+do_arg(
															+	const char *progname, const struct option *optlist,
															+	const char *arg, char *argv[]
															+) {
															+	/*	Handles one option argument; arg is the text that follows
															+		the '-' of argv[0].  Every character of arg is looked up
															+		in optlist and counted in options[].  An option whose
															+		indicator is not ' ' takes a value (see opt_value()),
															+		which ends the scan of this argument.
															+		Returns the number of argv entries used: the result of
															+		opt_value() if a value was picked up, otherwise 1.
															+		An unknown option or a missing value is reported through
															+		bad_option(), which ends the program.
															+	*/
															+	int consumed = 0;
															+
															+	while (*arg) {
															+		/* treat argument character */
															+		char opc = *arg++;
															+		unsigned char idx = (unsigned char)opc;
															+		const struct option *op = optlist;
															+
															+		/* find opc among the allowed options */
															+		while (op->op_char && op->op_char != opc) {
															+			op++;
															+		}
															+		if (!op->op_char) {
															+			bad_option(progname, optlist,
															+				"*option -%c unknown", opc
															+			);
															+			/*NOTREACHED*/
															+		}
															+
															+		/*	count the occurrence; the index is checked so
															+			that an option character outside the table
															+			cannot write outside options[]
															+		*/
															+		if (idx < sizeof options) {
															+			options[idx]++;
															+		}
															+
															+		if (op->op_indicator != ' ') {
															+			/* the option takes a value */
															+			consumed = opt_value(op, arg, argv);
															+			if (consumed < 0) {
															+				bad_option(progname, (struct option *)0,
															+					" option -%c requires another argument",
															+					op->op_char
															+				);
															+				/*NOTREACHED*/
															+			}
															+		}
															+		if (consumed) break;
															+	}
															+	if (!consumed) {
															+		/* no value picked up: only argv[0] itself was used */
															+		consumed = 1;
															+	}
															+
															+	return consumed;
															+}
														
 
															+
														
 
															+static int
															+opt_value(const struct option *op, const char *arg, char *argv[]) {
															+	/*	Locates the value of option *op and stores it through
															+		op->op_stringp.  Returns the number of argv entries
															+		used: 1 if the value is the rest of the option argument
															+		(arg), 2 if it is the next argument argv[1], or -1 if
															+		there is no next argument.
															+	*/
															+	const char *value = arg;
															+	int taken = 1;
															+
															+	if (*value == '\0') {
															+		/* nothing follows the option, so the value is argv[1] */
															+		value = argv[1];
															+		if (value == 0) return -1;
															+		taken = 2;
															+	}
															+	*op->op_stringp = value;
															+	return taken;
															+}
														
 
															+
														
 
															+static void
															+bad_option(
															+	const char *progname, const struct option *optlist, char *msg, int c
															+) {
															+	/*	Reports an option error on stderr and ends the program
															+		with exit status 1.  msg[0] is a flag rather than text:
															+		unless it is a blank, the allowed options in optlist are
															+		listed as well.  The rest of msg is a printf format that
															+		receives c as its only argument.
															+	*/
															+	const char *format = msg + 1;
															+	int show_list = (msg[0] != ' ');
															+
															+	fprintf(stderr, "%s: ", progname);
															+	fprintf(stderr, format, c);
															+	fputs("\n", stderr);
															+
															+	if (show_list) {
															+		const struct option *op = optlist;
															+
															+		fputs("Possible options are:\n", stderr);
															+		while (op->op_char) {
															+			fprintf(stderr, "\t-%c%c\t%s\n",
															+				op->op_char, op->op_indicator, op->op_text
															+			);
															+			op++;
															+		}
															+	}
															+	exit(1);
															+}
														
--- a/utils/sim_pasc/options.h
+++ b/utils/sim_pasc/options.h
@@ -0,0 +1,20 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: options.h,v 1.3 2001/11/13 12:55:53 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Setting and consulting command line options
														
 
															+*/
														
 
															+
														
 
															+/*	One entry of an option table.  The code in options.c walks a
															+	table until it meets an entry whose op_char is 0, so a table
															+	ends with such an entry.
															+*/
															+struct option {
															+	char op_char;		/* option character as typed after '-' */
															+	char *op_text;		/* elucidating text, printed in the
															+				   list of possible options */
															+	char op_indicator;	/* type indicator, N = int, F = file name;
															+				   ' ' = the option takes no value */
															+	const char **op_stringp;/* where the value string of the option
															+				   is stored when it is picked up */
															+};
														
 
															+
														
 
															+/*	Returns the number of times option -ch occurred in the arguments
															+	handled by do_options(), 0 if it did not occur.
															+*/
															+extern int option_set(char ch);
															+
															+/*	Handles the leading arguments of argv[0..argc-1] that start with
															+	'-' (an argument that is just "-" is not an option), using the
															+	option table optlist, and returns the number of argv entries
															+	they occupied.  An unknown option or a missing option value is
															+	reported on stderr, prefixed with progname, and ends the program.
															+*/
															+extern int do_options(
															+	const char *progname, const struct option *optlist,
															+	int argc, char *argv[]
															+);
														
--- a/utils/sim_pasc/pascallang.l
+++ b/utils/sim_pasc/pascallang.l
@@ -0,0 +1,256 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pascallang.l,v 2.9 2007/08/29 09:10:35 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	PASCAL language front end for the similarity tester.
														
 
															+	Author:	Maarten van der Meulen <[email protected]>
														
 
															+*/
														
 
															+
														
 
															+#include	"options.h"
														
 
															+#include	"algollike.h"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+
														
 
															+/* Data for module idf */
														
 
															+
														
 
															+/*	Names of the preprocessor commands and the tokens they map to;
															+	looked up by the rule for preprocessor lines, which passes
															+	NORM('#') to idf_in_list() as the token for other names.
															+	NOTE(review): the entries are in alphabetical order;
															+	idf_in_list() may depend on that -- confirm in idf.c before
															+	adding or moving entries.
															+*/
															+static const struct idf ppcmd[] = {
															+	{"define",	META('d')},
															+	{"else",	META('e')},
															+	{"endif",	META('E')},
															+	{"if",		META('i')},
															+	{"ifdef",	META('I')},
															+	{"ifndef",	META('x')},
															+	{"include",	MTCT('I')},
															+	{"line",	META('l')},
															+	{"undef",	META('u')}
															+};
														
 
															+
														
 
															+/*	The reserved words of Pascal, in lower case, and the tokens they
															+	map to; looked up by idf2token() after the identifier has been
															+	turned to lower case.  Words mapped to SKIP are not passed on
															+	by the identifier rules.
															+	NOTE(review): the entries are in alphabetical order;
															+	idf_in_list() may depend on that -- confirm in idf.c before
															+	adding or moving entries.
															+*/
															+static const struct idf reserved[] = {
															+	{"and",		NORM('&')},
															+	{"array",	NORM('A')},
															+	{"begin",	NORM('{')},
															+	{"case",	NORM('c')},
															+	{"const",	NORM('C')},
															+	{"div",		NORM('/')},
															+	{"do",		NORM('D')},
															+	{"downto",	NORM('d')},
															+	{"else",	NORM('e')},
															+	{"end",		NORM('}')},
															+	{"extern",	CTRL('E')},
															+	{"file",	NORM('F')},
															+	{"for",		NORM('f')},
															+	{"function",	NORM('p')},	/* Equal to procedure */
															+	{"goto",	NORM('g')},
															+	{"if",		NORM('i')},
															+	{"in",		NORM('I')},
															+	{"label",	NORM('l')},
															+	{"mod",		NORM('%')},
															+	{"nil",		NORM('n')},
															+	{"not",		NORM('!')},
															+	{"of",		SKIP},
															+	{"or",		NORM('|')},
															+	{"packed",	NORM('P')},
															+	{"procedure",	NORM('p')},
															+	{"program",	SKIP},
															+	{"record",	NORM('r')},
															+	{"repeat",	NORM('R')},
															+	{"set",		NORM('s')},
															+	{"then",	SKIP},
															+	{"to",		NORM('t')},
															+	{"type",	NORM('T')},
															+	{"until",	NORM('u')},
															+	{"var",		NORM('v')},
															+	{"while",	NORM('w')},
															+	{"with",	NORM('W')}
															+};
														
 
															+
														
 
															+/* Special treatment of identifiers */
														
 
															+
														
 
															+static void
															+lower_case(char *str) {
															+	/*	Replaces the letters A-Z in str by a-z, in place; all
															+		other characters are left as they are.  Needed because
															+		Pascal does not distinguish between upper and lower case.
															+	*/
															+	char *p = str;
															+
															+	while (*p != '\0') {
															+		char ch = *p;
															+
															+		if (ch >= 'A' && ch <= 'Z') {
															+			*p = (char)(ch - 'A' + 'a');
															+		}
															+		p++;
															+	}
															+}
														
 
															+
														
 
															+static TOKEN
															+idf2token(int hashing) {
															+	/*	Turns the identifier in yytext into a token.  yytext is
															+		first converted to lower case, in place, and then looked
															+		up in reserved[], with IDF passed as the token for other
															+		names.  If the result is IDF and hashing is non-zero,
															+		the result of idf_hashed() is returned instead.
															+	*/
															+	TOKEN result;
															+
															+	lower_case(yytext);
															+	result = idf_in_list(yytext, reserved, sizeof reserved, IDF);
															+
															+	/* only a plain identifier is eligible for hashing */
															+	if (!TOKEN_EQ(result, IDF)) return result;
															+	if (!hashing) return result;
															+
															+	/* return a one-token hash code */
															+	return idf_hashed(yytext);
															+}
														
 
															+
														
 
															+/* Token sets for module algollike */
														
 
															+/*	List of tokens, ended by NOTOKEN.
															+	NOTE(review): going by the name, these are the tokens on which
															+	a run may not end -- confirm against algollike.c.
															+*/
															+const TOKEN NonFinals[] = {
															+	IDF,		/* identifier */
															+	NORM('{'),	/* also begin */
															+	NORM('('),
															+	NORM('['),
															+	NORM('A'),	/* array */
															+	NORM('c'),	/* case */
															+	NORM('C'),	/* const */
															+	NORM('/'),	/* div */
															+	CTRL('E'),	/* extern */
															+	NORM('F'),	/* file */
															+	NORM('f'),	/* for */
															+	NORM('g'),	/* goto */
															+	NORM('i'),	/* if */
															+	NORM('l'),	/* label */
															+	NORM('P'),	/* packed */
															+	NORM('p'),	/* procedure/function */
															+	NORM('r'),	/* record */
															+	NORM('R'),	/* repeat */
															+	NORM('s'),	/* set */
															+	NORM('T'),	/* type */
															+	NORM('v'),	/* var */
															+	NORM('w'),	/* while */
															+	NORM('W'),	/* with */
															+	NOTOKEN
															+};
														
 
															+/*	List of tokens, ended by NOTOKEN.
															+	NOTE(review): going by the name, these are the tokens on which
															+	a run may not start -- confirm against algollike.c.
															+*/
															+const TOKEN NonInitials[] = {
															+	NORM(')'),
															+	NORM('}'),	/* end, see reserved[] */
															+	NORM(';'),
															+	NOTOKEN
															+};
														
 
															+/*	The opening bracket tokens, ended by NOTOKEN; each has its
															+	closing counterpart in Closers[].
															+*/
															+const TOKEN Openers[] = {
															+	NORM('{'),	/* begin, see reserved[] */
															+	NORM('('),
															+	NORM('['),
															+	NOTOKEN
															+};
														
 
															+/*	The closing bracket tokens, ended by NOTOKEN; each has its
															+	opening counterpart in Openers[].
															+*/
															+const TOKEN Closers[] = {
															+	NORM('}'),	/* end, see reserved[] */
															+	NORM(')'),
															+	NORM(']'),
															+	NOTOKEN
															+};
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
															+%option never-interactive
															+
															+	/* start condition in force while inside a comment */
															+%Start	Comment
															+
															+	/* layout other than newline, which has a rule of its own */
															+Layout		([ \t\r\f])
															+	/* the 95 printable ASCII characters, space included */
															+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
															+
															+	/* a backslash with the character after it */
															+AnyQuoted	(\\.)
															+	/* one element of a string: a backslash pair, or any character
															+	   except a quote, a newline or a backslash
															+	*/
															+StrChar		([^'\n\\]|{AnyQuoted})
															+
															+	/* both comment forms; the rules do not pair an opener with
															+	   its own closer, either closer ends the Comment state
															+	*/
															+StartComment	("{"|"(*")
															+EndComment	("}"|"*)")
															+	/* characters that cannot be part of a comment closer */
															+SafeComChar	([^*}\n])
															+	/* may be the first character of a closer */
															+UnsafeComChar	("*")
															+
															+Digit		([0-9])
															+Idf		([A-Za-z][A-Za-z0-9_]*)
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+{StartComment}	{			/* start of comment; see clang.l */
															+		BEGIN Comment;
															+	}
															+
															+<Comment>{SafeComChar}+	{		/* safe comment chunk, skipped */
															+	}
															+
															+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
															+	}
															+
															+<Comment>"\n"		{		/* to break up long comments */
															+		return_eol();
															+	}
															+
															+<Comment>{EndComment}	{		/* end-of-comment */
															+		BEGIN INITIAL;
															+	}
															+
															+\'{StrChar}*\'	{			/* character strings; the contents
															+					   are dropped and one double
															+					   quote is passed on */
															+		return_ch('"');
															+	}
															+
															+^#{Layout}*include.*	{		/* ignore #include lines */
															+	}
															+
															+^#{Layout}*{Idf}	{		/* a preprocessor line */
															+		register char *idf = yytext+1;
															+
															+		/* skip layout in front of preprocessor identifier */
															+		/* NOTE(review): only blank and tab are skipped here,
															+		   whereas Layout also admits \r and \f
															+		*/
															+		while (*idf == ' ' || *idf == '\t') {
															+			idf++;
															+		}
															+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
															+	}
															+
															+{Digit}+	{			/* numeral, passed as an identifier */
															+		return_tk(IDF);
															+	}
															+
															+{Idf}/"("	{			/* identifier in front of ( ;
															+					   hashed if option -F is set */
															+		register TOKEN tk;
															+
															+		tk = idf2token(option_set('F'));
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
															+	}
															+
															+{Idf}	{				/* identifier; words mapped to
															+					   SKIP are dropped */
															+		register TOKEN tk;
															+
															+		tk = idf2token(0 /* no hashing */);
															+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
															+	}
															+
															+\;	{				/* semicolon, passed on only if
															+					   option -f is set */
															+		if (option_set('f')) return_ch(yytext[0]);
															+	}
															+
															+\n	{				/* count newlines */
															+		return_eol();
															+	}
															+
															+{Layout}	{			/* ignore layout */
															+	}
															+
															+{ASCII95}	{			/* copy other text */
															+		return_ch(yytext[0]);
															+	}
															+
															+.	{				/* count non-ASCII chars */
															+		lex_non_ascii_cnt++;
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+void
															+yystart(void)
															+{
															+	/* Puts the scanner back into its INITIAL start condition. */
															+	BEGIN(INITIAL);
															+}
														
 
															+
														
 
															+int
															+yywrap(void)
															+{
															+	/*	Called by the scanner at end of input; a non-zero
															+		result means that no further input follows.
															+	*/
															+	return 1;
															+}
														
--- a/utils/sim_pasc/pass1.c
+++ b/utils/sim_pasc/pass1.c
@@ -0,0 +1,119 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass1.c,v 2.8 2007/08/27 09:57:32 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<string.h>
														
 
															+
														
 
															+#include	"debug.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"text.h"
														
 
															+#include	"tokenarray.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"error.h"
														
 
															+#include	"pass1.h"
														
 
															+
														
 
															+#ifdef	DB_TEXT
														
 
															+static void db_print_text(const struct text *);
														
 
															+#endif
														
 
															+
														
 
															+static void print_count(unsigned int cnt, const char *);
														
 
															+
														
 
															+void
															+Pass1(int argc, char *argv[]) {
															+	/*	Reads the texts named in argv[0..NumberOfTexts-1].  For
															+		each text its Text[] entry is filled in and its tokens,
															+		except the end-of-line tokens, are stored through
															+		StoreToken().  The token count of each text and the
															+		total are reported on OutputFile.
															+		An argument "/" is a separator, not a file name: nothing
															+		is read for it and its index is recorded in
															+		NumberOfNewTexts.
															+	*/
															+	register int n;
															+
															+	InitText(argc);
															+	InitTokenArray();
															+
															+	/* assume all texts to be new */
															+	NumberOfNewTexts = NumberOfTexts;
															+
															+	/* read the files */
															+	for (n = 0; n < NumberOfTexts; n++) {
															+		register char *fname = argv[n];
															+		register struct text *txt = &Text[n];
															+
															+		fprintf(OutputFile, "File %s: ", fname);
															+
															+		txt->tx_fname = fname;
															+		txt->tx_pos = 0;
															+		/* the text starts out empty, at the current end */
															+		txt->tx_start =
															+		txt->tx_limit = TextLength();
															+		if (strcmp(fname, "/") == 0) {
															+			fprintf(OutputFile, "separator\n");
															+			NumberOfNewTexts = n;
															+		}
															+		else {
															+			if (!OpenText(First, txt)) {
															+				fprintf(OutputFile, ">>>> cannot open <<<< ");
															+				/*	the file has still been opened
															+					with a null file for uniformity
															+				*/
															+			}
															+			while (NextTextTokenObtained(First)) {
															+				if (!TOKEN_EQ(lex_token, EOL)) {
															+					StoreToken();
															+				}
															+			}
															+			CloseText(First, txt);
															+			txt->tx_limit = TextLength();
															+
															+			/* report */
															+			print_count(txt->tx_limit - txt->tx_start, "token");
															+			if (lex_non_ascii_cnt) {
															+				/*	the separator goes to the same
															+					stream as the two counts it
															+					separates, so that the report
															+					line stays in one piece
															+				*/
															+				fprintf(OutputFile, ", ");
															+				print_count(lex_non_ascii_cnt,
															+					"non-ASCII character"
															+				);
															+			}
															+			fprintf(OutputFile, "\n");
															+#ifdef	DB_TEXT
															+			db_print_text(txt);
															+#endif	/* DB_TEXT */
															+		}
															+		fflush(OutputFile);
															+	}
															+
															+	/* report total */
															+	fprintf(OutputFile, "Total: ");
															+	print_count(TextLength() - 1, "token");
															+	fprintf(OutputFile, "\n\n");
															+	fflush(OutputFile);
															+}
														
 
															+
														
 
															+static void
															+print_count(unsigned int cnt, const char *unit) {
															+	/*	Writes cnt followed by unit to OutputFile, appending an
															+		"s" to the unit unless cnt is exactly 1; suitable for
															+		units that form their plural by suffixing -s.
															+	*/
															+	const char *plural = "s";
															+
															+	if (cnt == 1) {
															+		plural = "";
															+	}
															+	fprintf(OutputFile, "%u %s%s", cnt, unit, plural);
															+}
														
 
															+
														
 
															+#ifdef	DB_TEXT
														
 
															+
														
 
															+static void
															+db_print_text(const struct text *txt) {
															+	/*	Prints a text (in compressed form) on DebugFile: a header
															+		with the file name, the number of tokens and the token
															+		range, followed by the tokens
															+		TokenArray[tx_start .. tx_limit-1], with a line break in
															+		front of every 32nd token.
															+	*/
															+	register int i;
															+
															+	fprintf(DebugFile, "\n\n**** DB_PRINT_TEXT ****\n");
															+
															+	fprintf(DebugFile, "File \"%s\", %u tokens, ",
															+		txt->tx_fname, txt->tx_limit - txt->tx_start
															+	);
															+	fprintf(DebugFile, "txt->tx_start = %u, txt->tx_limit = %u\n",
															+		txt->tx_start, txt->tx_limit
															+	);
															+
															+	for (i = txt->tx_start; i < txt->tx_limit; i++) {
															+		if ((i - txt->tx_start + 1) % 32 == 0) {
															+			fprintf(DebugFile, "\n");
															+		}
															+		/*	the tokens go to DebugFile like the rest of the
															+			dump, so that header, tokens and line breaks
															+			end up in one stream
															+		*/
															+		print_token(DebugFile, TokenArray[i]);
															+	}
															+	fprintf(DebugFile, "\n");
															+}
														
 
															+
														
 
															+#endif	/* DB_TEXT */
														
--- a/utils/sim_pasc/pass1.h
+++ b/utils/sim_pasc/pass1.h
@@ -0,0 +1,9 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass1.h,v 1.3 2001/09/28 09:03:50 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Reads the input files; stores the tokens in TOKEN TokenArray[]
															+	and the input file descriptions in struct text Text[].
															+	argv[] holds the names of the input files; an entry "/" is a
															+	separator between them and is not read as a file.
															+*/
															+extern void Pass1(int argc, char *argv[]);
														
--- a/utils/sim_pasc/pass2.c
+++ b/utils/sim_pasc/pass2.c
@@ -0,0 +1,154 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass2.c,v 2.10 2004/08/05 09:49:46 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+
														
 
															+#include	"debug.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"text.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"pass2.h"
														
 
															+
														
 
															+#ifdef	DB_POS
														
 
															+static void db_print_pos_list(const char *, const struct position *);
														
 
															+static void db_print_lex(const char *);
														
 
															+#endif
														
 
															+
														
 
															+static void pass2_txt(struct text *txt);
														
 
															+static int next_eol_obtained(void);
														
 
															+
														
 
															+void
														
 
															+Pass2(void) {
														
 
															+	int n;
														
 
															+
														
 
															+	for (n = 0; n < NumberOfTexts; n++) {
														
 
															+		pass2_txt(&Text[n]);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+/* instantiate sort_pos_list() */
														
 
															+#define	SORT_STRUCT		position
														
 
															+#define	SORT_NAME		sort_pos_list
														
 
															+#define	SORT_BEFORE(p1,p2)	((p1)->ps_tk_cnt < (p2)->ps_tk_cnt)
														
 
															+#define	SORT_NEXT		ps_next
														
 
															+#include	"sortlist.bdy"
														
 
															+
														
 
															+static void
														
 
															+pass2_txt(struct text *txt) {
														
 
															+	register struct position *pos;
														
 
															+	register unsigned int old_nl_cnt;
														
 
															+
														
 
															+	if (!txt->tx_pos)	/* no need to scan the file */
														
 
															+		return;
														
 
															+
														
 
															+	if (!OpenText(Second, txt)) {
														
 
															+		fprintf(stderr, ">>>> File %s disappeared <<<<\n",
														
 
															+			txt->tx_fname
														
 
															+		);
														
 
															+	}
														
 
															+	/* sets lex_nl_cnt and lex_tk_cnt */
														
 
															+
														
 
															+#ifdef	DB_POS
														
 
															+	db_print_pos_list("before sorting", txt->tx_pos);
														
 
															+#endif	/* DB_POS */
														
 
															+
														
 
															+	sort_pos_list(&txt->tx_pos);
														
 
															+
														
 
															+#ifdef	DB_POS
														
 
															+	db_print_pos_list("after sorting", txt->tx_pos);
														
 
															+#endif	/* DB_POS */
														
 
															+
														
 
															+#ifdef	DB_NL_BUFF
														
 
															+	db_print_nl_buff(txt->tx_nl_start, txt->tx_nl_limit);
														
 
															+#endif	/* DB_NL_BUFF */
														
 
															+
														
 
															+	old_nl_cnt = 1;
														
 
															+	pos = txt->tx_pos;
														
 
															+	while (pos) {
														
 
															+		/* we scan the pos list and the file in parallel */
														
 
															+
														
 
															+		/* find the corresponding line */
														
 
															+		while (pos->ps_tk_cnt >= lex_tk_cnt) {
														
 
															+			/* pos does not refer to this line, try the next */
														
 
															+
														
 
															+			/* shift the administration */
														
 
															+			old_nl_cnt = lex_nl_cnt;
														
 
															+			/* and get the next eol position */
														
 
															+			if (!next_eol_obtained()) {
														
 
															+				/* ouch! not enough lines! */
														
 
															+				fprintf(stderr, ">>>> File %s modified <<<<\n",
														
 
															+					txt->tx_fname
														
 
															+				);
														
 
															+				break;
														
 
															+			}
														
 
															+#ifdef	DB_POS
														
 
															+			db_print_lex(txt->tx_fname);
														
 
															+#endif	/* DB_POS */
														
 
															+		}
														
 
															+
														
 
															+		/* fill in the pos */
														
 
															+		switch (pos->ps_type) {
														
 
															+		case 0:	/* first token of run */
														
 
															+			pos->ps_nl_cnt = old_nl_cnt;
														
 
															+			break;
														
 
															+		case 1:	/* last token of run */
														
 
															+			pos->ps_nl_cnt = lex_nl_cnt;
														
 
															+			break;
														
 
															+		}
														
 
															+		/* and get the next pos */
														
 
															+		pos = pos->ps_next;
														
 
															+	}
														
 
															+
														
 
															+#ifdef	DB_POS
														
 
															+	db_print_pos_list("after scanning", txt->tx_pos);
														
 
															+#endif	/* DB_POS */
														
 
															+
														
 
															+	CloseText(Second, txt);
														
 
															+}
														
 
															+
														
 
															+static int
														
 
															+next_eol_obtained(void) {
														
 
															+	while (NextTextTokenObtained(Second)) {
														
 
															+		if (TOKEN_EQ(lex_token, EOL)) return 1;
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+#ifdef	DB_POS
														
 
															+
														
 
															+static void
														
 
															+db_print_pos(const struct position *pos) {
														
 
															+	fprintf(DebugFile, "pos type: %s; token count: %u",
														
 
															+		(pos->ps_type == 0 ? "first" : " last"),
														
 
															+		pos->ps_tk_cnt
														
 
															+	);
														
 
															+	fprintf(DebugFile, ", line#: ");
														
 
															+	if (pos->ps_nl_cnt == -1) {
														
 
															+		fprintf(DebugFile, "<NOT SET>");
														
 
															+	}
														
 
															+	else {
														
 
															+		fprintf(DebugFile, "%u", pos->ps_nl_cnt);
														
 
															+	}
														
 
															+	fprintf(DebugFile, "\n");
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+db_print_pos_list(const char *msg, const struct position *pos) {
														
 
															+	fprintf(DebugFile, "\n**** DB_PRINT_POS_LIST, %s ****\n", msg);
														
 
															+
														
 
															+	while (pos) {
														
 
															+		db_print_pos(pos);
														
 
															+		pos = pos->ps_next;
														
 
															+	}
														
 
															+	fprintf(DebugFile, "\n");
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+db_print_lex(const char *fn) {
														
 
															+	fprintf(DebugFile, "%s: lex_tk_cnt = %u, lex_nl_cnt = %u\n",
														
 
															+		fn, lex_tk_cnt, lex_nl_cnt);
														
 
															+}
														
 
															+
														
 
															+#endif	/* DB_POS */
														
--- a/utils/sim_pasc/pass2.h
+++ b/utils/sim_pasc/pass2.h
@@ -0,0 +1,9 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass2.h,v 1.2 1998/01/21 14:27:58 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Determines for each position that is part of a run, at which
														
 
															+	line number it starts and ends.
														
 
															+*/
														
 
															+extern void Pass2(void);
														
--- a/utils/sim_pasc/pass3.c
+++ b/utils/sim_pasc/pass3.c
@@ -0,0 +1,356 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass3.c,v 2.11 2005/02/20 17:03:03 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<string.h>
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"system.par"
														
 
															+#include	"debug.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"runs.h"
														
 
															+#include	"error.h"
														
 
															+#include	"options.h"
														
 
															+#include	"pass3.h"
														
 
															+#include	"percentages.h"
														
 
															+
														
 
															+#ifdef	DB_RUN
														
 
															+#include	"tokenarray.h"
														
 
															+static void db_run(const struct run *);
														
 
															+#endif
														
 
															+
														
 
															+static FILE *open_chunk(const struct chunk *);
														
 
															+static void fill_line(FILE *, char []);
														
 
															+static void clear_line(char []);
														
 
															+static void show_runs(void);
														
 
															+static void show_run(const struct run *);
														
 
															+static void show_2C_line(const char [], const char []);
														
 
															+static void show_1C_line(FILE *, const char *);
														
 
															+static int prhead(const struct chunk *);
														
 
															+static int prs(const char *);
														
 
															+static int pru(unsigned int);
														
 
															+static int unslen(unsigned int);
														
 
															+
														
 
															+static int maxline;			/* Actual maximum line length */
														
 
															+static char *line0;			/* by malloc() */
														
 
															+static char *line1;
														
 
															+
														
 
															void
															Pass3(void) {
																/*	Produces the output of the third pass: the runs
															 		themselves, unless option 'p' is set, in which case
															 		show_percentages() is called instead.
															 	*/
																if (!option_set('p')) {
																	show_runs();
																	return;
																}
																show_percentages();
															}
														
 
															+
														
 
															+static void
														
 
															+show_runs(void) {
														
 
															+	AisoIter iter;
														
 
															+	struct run *run;
														
 
															+
														
 
															+	maxline = PageWidth / 2 - 2;
														
 
															+	line0 = malloc((unsigned int)((maxline + 1) * sizeof (char)));
														
 
															+	line1 = malloc((unsigned int)((maxline + 1) * sizeof (char)));
														
 
															+	if (!line0 || !line1) fatal("out of memory");
														
 
															+
														
 
															+	OpenIter(&iter);
														
 
															+	while (GetAisoItem(&iter, &run)) {
														
 
															+#ifdef	DB_RUN
														
 
															+		db_run(run);
														
 
															+#endif	/* DB_RUN */
														
 
															+		show_run(run);
														
 
															+		fprintf(OutputFile, "\n");
														
 
															+	}
														
 
															+	CloseIter(&iter);
														
 
															+
														
 
															+	free(line0); line0 = 0;
														
 
															+	free(line1); line1 = 0;
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+show_run(const struct run *run) {
														
 
															+	/* The animals came in two by two ... */
														
 
															+	register const struct chunk *cnk0 = &run->rn_cn0;
														
 
															+	register const struct chunk *cnk1 = &run->rn_cn1;
														
 
															+	register unsigned int nl_cnt0 =
														
 
															+			cnk0->ch_last.ps_nl_cnt - cnk0->ch_first.ps_nl_cnt;
														
 
															+	register unsigned int nl_cnt1 =
														
 
															+			cnk1->ch_last.ps_nl_cnt - cnk1->ch_first.ps_nl_cnt;
														
 
															+	FILE *f0;
														
 
															+	FILE *f1;
														
 
															+
														
 
															+	/* display heading of chunk */
														
 
															+	if (!option_set('d')) {
														
 
															+		/* no assumptions about the lengths of the file names! */
														
 
															+		register unsigned int size = run->rn_size;
														
 
															+		register int pos = 0;
														
 
															+
														
 
															+		pos += prhead(cnk0);
														
 
															+		while (pos < maxline + 1) {
														
 
															+			pos += prs(" ");
														
 
															+		}
														
 
															+		pos += prs("|");
														
 
															+		pos += prhead(cnk1);
														
 
															+		while (pos < 2*maxline - unslen(size)) {
														
 
															+			pos += prs(" ");
														
 
															+		}
														
 
															+		fprintf(OutputFile, "[%u]\n", size);
														
 
															+	}
														
 
															+	else {
														
 
															+		(void)prhead(cnk0);
														
 
															+		fprintf(OutputFile, "\n");
														
 
															+		(void)prhead(cnk1);
														
 
															+		fprintf(OutputFile, "\n");
														
 
															+	}
														
 
															+
														
 
															+	/* stop if that suffices */
														
 
															+	if (option_set('n'))
														
 
															+		return;			/* ... had enough so soon ... */
														
 
															+
														
 
															+	/* open the files that hold the chunks */
														
 
															+	f0 = open_chunk(cnk0);
														
 
															+	f1 = open_chunk(cnk1);
														
 
															+
														
 
															+	/* display the chunks in the required format */
														
 
															+	if (!option_set('d')) {
														
 
															+		/* fill 2-column lines and print them */
														
 
															+		while (nl_cnt0 != 0 || nl_cnt1 != 0) {
														
 
															+			if (nl_cnt0) {
														
 
															+				fill_line(f0, line0);
														
 
															+				nl_cnt0--;
														
 
															+			}
														
 
															+			else {
														
 
															+				clear_line(line0);
														
 
															+			}
														
 
															+			if (nl_cnt1) {
														
 
															+				fill_line(f1, line1);
														
 
															+				nl_cnt1--;
														
 
															+			}
														
 
															+			else {
														
 
															+				clear_line(line1);
														
 
															+			}
														
 
															+			show_2C_line(line0, line1);
														
 
															+		}
														
 
															+	}
														
 
															+	else {
														
 
															+		/* display the lines in a diff(1)-like format */
														
 
															+		while (nl_cnt0--) {
														
 
															+			show_1C_line(f0, "<");
														
 
															+		}
														
 
															+		fprintf(OutputFile, "---\n");
														
 
															+		while (nl_cnt1--) {
														
 
															+			show_1C_line(f1, ">");
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	/* close the pertinent files */
														
 
															+	fclose(f0);
														
 
															+	fclose(f1);
														
 
															+}
														
 
															+
														
 
															+static int
														
 
															+prhead(const struct chunk *cnk) {
														
 
															+	register int pos = 0;
														
 
															+
														
 
															+	pos += prs(cnk->ch_text->tx_fname);
														
 
															+	pos += prs(": line ");
														
 
															+	pos += pru(cnk->ch_first.ps_nl_cnt);
														
 
															+	pos += prs("-");
														
 
															+	pos += pru(cnk->ch_last.ps_nl_cnt - 1);
														
 
															+	return pos;
														
 
															+}
														
 
															+
														
 
															+static int
														
 
															+prs(const char *str) {
														
 
															+	fprintf(OutputFile, "%s", str);
														
 
															+	return strlen(str);
														
 
															+}
														
 
															+
														
 
															+static int
														
 
															+pru(unsigned int u) {
														
 
															+	fprintf(OutputFile, "%u", u);
														
 
															+	return unslen(u);
														
 
															+}
														
 
															+
														
 
															static int
															unslen(unsigned int u) {
																/*	Returns the number of decimal digits needed to print u;
															 		0 counts as one digit.
															 	*/
																int digits;

																for (digits = 1; u >= 10; u /= 10) {
																	digits++;
																}
																return digits;
															}
														
 
															+
														
 
															+static FILE *
														
 
															+open_chunk(const struct chunk *cnk) {
														
 
															+	/*	opens the file in which the chunk resides, positions the
														
 
															+		file at the beginning of the chunk and returns the file pointer
														
 
															+	*/
														
 
															+	register char *fname = cnk->ch_text->tx_fname;
														
 
															+	register FILE *f = fopen(fname, "r");
														
 
															+	register unsigned int nl_cnt;
														
 
															+
														
 
															+	if (!f) {
														
 
															+		fprintf(stderr, ">>>> File %s disappeared <<<<\n", fname);
														
 
															+		f = fopen(NULLFILE, "r");
														
 
															+	}
														
 
															+
														
 
															+	nl_cnt = cnk->ch_first.ps_nl_cnt;
														
 
															+	while (nl_cnt > 1) {
														
 
															+		int ch = getc(f);
														
 
															+
														
 
															+		if (ch < 0) break;
														
 
															+		if (ch == '\n') {
														
 
															+			nl_cnt--;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	return f;
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+fill_line(FILE *f, char ln[]) {
														
 
															+	/*	Reads one line from f and puts it in condensed form in ln.
														
 
															+	*/
														
 
															+	register int indent = 0, lpos = 0;
														
 
															+	register int ch;
														
 
															+
														
 
															+	/* condense and skip initial blank */
														
 
															+	while ((ch = getc(f)), ch == ' ' || ch == '\t') {
														
 
															+		if (ch == '\t') {
														
 
															+			indent = 8;
														
 
															+		}
														
 
															+		else {
														
 
															+			indent++;
														
 
															+		}
														
 
															+		if (indent == 8) {
														
 
															+			/* every eight blanks give one blank */
														
 
															+			if (lpos < maxline) {
														
 
															+				ln[lpos++] = ' ';
														
 
															+			}
														
 
															+			indent = 0;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	/* store the rest */
														
 
															+	while (ch >= 0 && ch != '\n') {
														
 
															+		if (ch == '\t') {
														
 
															+			/* replace tabs by blanks */
														
 
															+			ch = ' ';
														
 
															+		}
														
 
															+		if (lpos < maxline) {
														
 
															+			ln[lpos++] = ch;
														
 
															+		}
														
 
															+		ch = getc(f);
														
 
															+	}
														
 
															+	ln[lpos] = '\0';		/* always room for this one */
														
 
															+}
														
 
															+
														
 
															static void
															clear_line(char ln[]) {
																/*	Makes ln the empty string; only its first byte is
															 		touched.
															 	*/
																*ln = '\0';
															}
														
 
															+
														
 
															+static void
														
 
															+show_2C_line(const char ln0[], const char ln1[]) {
														
 
															+	/*	displays the contents of the two lines in a two-column
														
 
															+		format
														
 
															+	*/
														
 
															+	register int i;
														
 
															+
														
 
															+	for (i = 0; i < maxline && ln0[i] != '\0'; i++) {
														
 
															+		fputc(ln0[i], OutputFile);
														
 
															+	}
														
 
															+	for (; i < maxline; i++) {
														
 
															+		fputc(' ', OutputFile);
														
 
															+	}
														
 
															+	fprintf(OutputFile, " |");
														
 
															+
														
 
															+	for (i = 0; i < maxline && ln1[i] != '\0'; i++) {
														
 
															+		fputc(ln1[i], OutputFile);
														
 
															+	}
														
 
															+	fprintf(OutputFile, "\n");
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+show_1C_line(FILE *f, const char *marker) {
														
 
															+	/*	displays one line from f, preceded by the marker
														
 
															+	*/
														
 
															+	register int ch;
														
 
															+
														
 
															+	fprintf(OutputFile, "%s", marker);
														
 
															+	while ((ch = getc(f)), ch > 0 && ch != '\n') {
														
 
															+		fputc(ch, OutputFile);
														
 
															+	}
														
 
															+	fputc('\n', OutputFile);
														
 
															+}
														
 
															+
														
 
															+#ifdef	DB_RUN
														
 
															+
														
 
															+static void db_chunk(const struct chunk *);
														
 
															+
														
 
															+static void
														
 
															+db_run(const struct run *run) {
														
 
															+	/* prints detailed data about a run */
														
 
															+	register const struct chunk *cnk0 = &run->rn_cn0;
														
 
															+	register const struct chunk *cnk1 = &run->rn_cn1;
														
 
															+
														
 
															+	fprintf(DebugFile, "File %s / file %s:\n",
														
 
															+		cnk0->ch_text->tx_fname,
														
 
															+		cnk1->ch_text->tx_fname
														
 
															+	);
														
 
															+	fprintf(DebugFile, "from token %u/%u to %u/%u:",
														
 
															+		cnk0->ch_first.ps_tk_cnt, cnk1->ch_first.ps_tk_cnt,
														
 
															+		cnk0->ch_last.ps_tk_cnt, cnk1->ch_last.ps_tk_cnt
														
 
															+	);
														
 
															+	fprintf(DebugFile, " from lines %u/%u to %u/%u:",
														
 
															+		cnk0->ch_first.ps_nl_cnt, cnk1->ch_first.ps_nl_cnt,
														
 
															+		cnk0->ch_last.ps_nl_cnt, cnk1->ch_last.ps_nl_cnt
														
 
															+	);
														
 
															+	fprintf(DebugFile, " %u %s\n",
														
 
															+		run->rn_size,
														
 
															+		(run->rn_size == 1 ? "token" : "tokens")
														
 
															+	);
														
 
															+
														
 
															+	db_chunk(cnk0);
														
 
															+	db_chunk(cnk1);
														
 
															+}
														
 
															+
														
 
															+static void
														
 
															+db_chunk(const struct chunk *cnk) {
														
 
															+	/*	print the tokens in the chunk, with a one-char margin
														
 
															+	*/
														
 
															+	unsigned int i;
														
 
															+	const struct position *first = &cnk->ch_first;
														
 
															+	const struct position *last = &cnk->ch_last;
														
 
															+	unsigned int start = cnk->ch_text->tx_start;
														
 
															+
														
 
															+	if (first->ps_tk_cnt > 0) {
														
 
															+		fprintf(DebugFile, "...");
														
 
															+		print_token(stdout, TokenArray[start + first->ps_tk_cnt - 1]);
														
 
															+		fprintf(DebugFile, "  ");
														
 
															+	}
														
 
															+	else {	/* create same offset as above */
														
 
															+		fprintf(DebugFile, "       ");
														
 
															+	}
														
 
															+
														
 
															+	for (i = first->ps_tk_cnt; i <= last->ps_tk_cnt; i++) {
														
 
															+		print_token(stdout, TokenArray[start + i]);
														
 
															+	}
														
 
															+
														
 
															+	if (start + last->ps_tk_cnt + 1 < cnk->ch_text->tx_limit) {
														
 
															+		fprintf(DebugFile, "  ");
														
 
															+		print_token(stdout, TokenArray[start + last->ps_tk_cnt + 1]);
														
 
															+		fprintf(DebugFile, "...");
														
 
															+	}
														
 
															+
														
 
															+	fprintf(DebugFile, "\n");
														
 
															+}
														
 
															+
														
 
															+#endif	/* DB_RUN */
														
--- a/utils/sim_pasc/pass3.h
+++ b/utils/sim_pasc/pass3.h
@@ -0,0 +1,7 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: pass3.h,v 1.2 1998/01/21 14:28:01 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Print the contents of runs */
														
 
															+extern void Pass3(void);
														
--- a/utils/sim_pasc/percentages.c
+++ b/utils/sim_pasc/percentages.c
@@ -0,0 +1,115 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: percentages.c,v 1.3 2007/08/27 09:57:33 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"sim.h"
														
 
															+#include	"runs.h"
														
 
															+#include	"error.h"
														
 
															+#include	"percentages.h"
														
 
															+
														
 
															+/*	One entry per ordered (text0, text1) pair of texts, accumulating the
															+	sizes of the runs found between them.
															+*/
															+struct match {
															+	struct match *ma_next;	/* next entry in the singly-linked list */
															+	struct text *ma_text0;	/* the text the percentage is computed for */
															+	struct text *ma_text1;	/* the text it is compared to */
															+	unsigned int ma_size;	/* sum of rn_size of the runs of this pair */
															+};
															+
															+static struct match *match_start;	/* head of the match list; entries are allocated by malloc */
														
 
															+
														
 
															+/*	Account for run r in the match list: its size is added to the entry
															+	for its (text0, text1) pair, and a new entry is appended if the pair
															+	has none yet. Runs inside one and the same text are ignored.
															+	Returns 1 on success and 0 if out of memory, in line with add_to_run().
															+*/
															+int
															+add_to_percentages(struct run *r) {
															+	struct text *t0 = r->rn_cn0.ch_text;
															+	struct text *t1 = r->rn_cn1.ch_text;
															+	struct match **hook;
															+	struct match *fresh;
															+
															+	/* percentages are only meaningful between different files */
															+	if (t0 == t1) return 1;
															+
															+	/* search the list for the (text0, text1) combination */
															+	for (hook = &match_start; *hook; hook = &(*hook)->ma_next) {
															+		if ((*hook)->ma_text0 != t0) continue;
															+		if ((*hook)->ma_text1 != t1) continue;
															+		/* already present; just enlarge it */
															+		(*hook)->ma_size += r->rn_size;
															+		return 1;
															+	}
															+
															+	/* not present; hang a new entry on the end of the list */
															+	fresh = (struct match *)malloc(sizeof (struct match));
															+	*hook = fresh;
															+	if (fresh == 0) return 0;
															+
															+	fresh->ma_next = 0;
															+	fresh->ma_text0 = t0;
															+	fresh->ma_text1 = t1;
															+	fresh->ma_size = r->rn_size;
															+	return 1;
															+}
														
 
															+
														
 
															+/*	Give every (text0, text1) entry in the match list a twin (text1, text0)
															+	of the same size, inserted directly behind it.
															+	Calls fatal() when out of memory.
															+*/
															+static void
															+add_reverse_entries_to_match_list(void) {
															+	struct match *cur = match_start;
															+
															+	while (cur) {
															+		struct match *twin =
															+			(struct match *)malloc(sizeof (struct match));
															+
															+		if (!twin) fatal("out of memory");
															+
															+		/* the twin has the two texts swapped */
															+		twin->ma_text0 = cur->ma_text1;
															+		twin->ma_text1 = cur->ma_text0;
															+		twin->ma_size = cur->ma_size;
															+
															+		/* hook it in between the original and its successor */
															+		twin->ma_next = cur->ma_next;
															+		cur->ma_next = twin;
															+
															+		/* continue behind the twin, so it is not doubled itself */
															+		cur = twin->ma_next;
															+	}
															+}
														
 
															+
														
 
															+/*	The fraction (not yet multiplied by 100) that m->ma_size represents
															+	of the number of TokenArray[] positions occupied by m->ma_text0.
															+*/
															+static float
															+match_percentage(struct match *m) {
															+	struct text *txt = m->ma_text0;
															+	int n_tokens = txt->tx_limit - txt->tx_start;
															+
															+	return (m->ma_size*1.0/n_tokens);
															+}
														
 
															+
														
 
															+/*	instantiate sort_match_list(); SORT_BEFORE compares the match
															+	percentages of two entries, so presumably the list ends up in
															+	descending percentage order -- confirm against sortlist.bdy
															+*/
															+#define	SORT_STRUCT		match
															+#define	SORT_NAME		sort_match_list
															+#define	SORT_BEFORE(p1,p2)	(match_percentage(p1) > match_percentage(p2))
															+#define	SORT_NEXT		ma_next
															+#include	"sortlist.bdy"
														
 
															+
														
 
															+/*	Write one line per entry of the match list to OutputFile, in list
															+	order, with the percentage truncated to a whole number.
															+*/
															+static void
															+print_percentages(void) {
															+	struct match *m;
															+
															+	for (m = match_start; m; m = m->ma_next) {
															+		int perc = (int)(match_percentage(m)*100.0);
															+
															+		fprintf(OutputFile,
															+			"%s consists for %d %% of %s material\n",
															+			m->ma_text0->tx_fname, perc, m->ma_text1->tx_fname
															+		);
															+	}
															+}
														
 
															+
														
 
															+/*	Produce the percentage report: extend the match list with the
															+	reversed entries, sort it, and print it.
															+*/
															+void
															+show_percentages(void) {
															+	add_reverse_entries_to_match_list();
															+	sort_match_list(&match_start);
															+	print_percentages();
															+}
														
--- a/utils/sim_pasc/percentages.h
+++ b/utils/sim_pasc/percentages.h
@@ -0,0 +1,7 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: percentages.h,v 1.2 2004/08/05 09:49:48 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Adds run r to the percentage bookkeeping; returns 0 if out of memory
															+	and 1 otherwise.
															+*/
															+extern int add_to_percentages(struct run *r);
															+/*	Prints the collected percentages to OutputFile. */
															+extern void show_percentages(void);
														
--- a/utils/sim_pasc/runs.c
+++ b/utils/sim_pasc/runs.c
@@ -0,0 +1,11 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: runs.c,v 1.2 2001/11/08 12:30:30 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	"sim.h"
														
 
															+#include	"runs.h"
														
 
															+
														
 
															+#define	AISO_BEFORE(r0,r1)	((r0)->rn_size > (r1)->rn_size)
														
 
															+
														
 
															+#include	"aiso.bdy"
														
--- a/utils/sim_pasc/runs.h
+++ b/utils/sim_pasc/runs.h
@@ -0,0 +1,33 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: runs.h,v 1.2 2001/11/08 12:30:30 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Although all other segments of data in this program are described by
														
 
															+	giving the position of the first in the segment and that of the
														
 
															+	first not in the segment (so the size is the difference of the two),
														
 
															+	a `chunk' is given by first and last. This is done because later on we
														
 
															+	are interested in the actual position of the last token of it, and
														
 
															+	the position of the first token not in the segment gives no
														
 
															+	indication about that.
														
 
															+*/
														
 
															+
														
 
															+struct chunk {
														
 
															+	/* a chunk of text in various representations */
														
 
															+	struct text *ch_text;		/* pointer to the file */
														
 
															+	struct position ch_first;	/* first in chunk */
														
 
															+	struct position ch_last;	/* last in chunk */
														
 
															+};
														
 
															+
														
 
															+struct run {				/* a 'run' of coincident tokens */
															+	struct chunk rn_cn0;		/* chunk in left file */
															+	struct chunk rn_cn1;		/* chunk in right file */
															+	unsigned int rn_size;		/* size of the run; presumably in tokens -- TODO confirm */
															+};
														
 
															+
														
 
															+#define	AISO_TYPE	struct run *
														
 
															+#define	AISO_ITERATOR
														
 
															+
														
 
															+#define	add_to_runs(r)	InsertAiso(r)
														
 
															+
														
 
															+#include	"aiso.spc"
														
--- a/utils/sim_pasc/settings.par
+++ b/utils/sim_pasc/settings.par
@@ -0,0 +1,8 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: settings.par,v 1.1 1997/06/20 12:03:22 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#define	DFLT_MIN_RUN_SIZE	24	/* default minimum run size */
														
 
															+
														
 
															+#define	DFLT_PAGE_WIDTH		80	/* default page width */
														
--- a/utils/sim_pasc/sim.1
+++ b/utils/sim_pasc/sim.1
@@ -0,0 +1,176 @@
 
															+.\"	This file is part of the software similarity tester SIM.
														
 
															+.\"	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+.\"	$Id: sim.1,v 2.6 2004/08/05 09:49:49 dick Exp $
														
 
															+.\"
														
 
															+.TH SIM 1 2001/11/13 "Vrije Universiteit"
														
 
															+.SH NAME
														
 
															+sim \- find similarities in C, Java, Pascal, Modula-2, Lisp, Miranda or text files
														
 
															+.SH SYNOPSIS
														
 
															+.B sim_c
														
 
															+[
														
 
															+.B \-[defFnpsS]
														
 
															+.B \-r
														
 
															+.I N
														
 
															+.B \-w
														
 
															+.I N
														
 
															+.B \-o
														
 
															+.I F
														
 
															+]
														
 
															+file ... [
														
 
															+.B /
														
 
															+[ file ... ] ]
														
 
															+.br
														
 
															+.B sim_c
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_java
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_pasc
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_m2
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_lisp
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_mira
														
 
															+\&...
														
 
															+.br
														
 
															+.B sim_text
														
 
															+\&...
														
 
															+.br
														
 
															+.SH DESCRIPTION
														
 
															+.I Sim_c
														
 
															+reads the C files
														
 
															+.I file ...
														
 
															+and looks for pieces of text that are similar; two pieces of program text
														
 
															+are similar if they only differ in layout, comment, identifiers and
														
 
															+the contents of numbers, strings and characters.
														
 
															+If any runs of sufficient length
														
 
															+are found, they are reported on standard output; the number of significant
														
 
															+tokens in the run is given between square brackets.
														
 
															+.PP
														
 
															+.I Sim_java
														
 
															+does the same for Java,
														
 
															+.I sim_pasc
														
 
															+for Pascal,
														
 
															+.I sim_m2
														
 
															+for Modula-2,
														
 
															+.I sim_lisp
														
 
															+for Lisp, and
														
 
															+.I sim_mira
														
 
															+for Miranda.
														
 
															+.I Sim_text
														
 
															+works on arbitrary text; it is occasionally useful on shell scripts.
														
 
															+.PP
														
 
															+The program can be used for finding copied pieces of code in
														
 
															+purportedly unrelated programs (with
														
 
															+.B \-s
														
 
															+or
														
 
															+.BR \-S ),
														
 
															+or for finding accidentally duplicated code in larger projects (with
														
 
															+.BR \-f ).
														
 
															+.PP
														
 
															+If a
														
 
															+.B /
														
 
															+is present between the input files, the latter are divided into a group of
														
 
															+"new" files (before the
														
 
															+.BR / )
														
 
															+and a group of "old" files; if there is no
														
 
															+.BR / ,
														
 
															+all files are "new".
														
 
															+Old files are never compared to each other.
														
 
															+Since the similarity tester
														
 
															+reads the files several times, it cannot read from standard input.
														
 
															+.PP
														
 
															+There are the following options:
														
 
															+.TP
														
 
															+.B \-d
														
 
															+The output is in a diff(1)-like format instead of the default
														
 
															+2-column format.
														
 
															+.TP
														
 
															+.B \-e
														
 
															+Each file is compared to each file in isolation; this will find all
														
 
															+similarities between all texts involved, regardless of duplicates.
														
 
															+.TP
														
 
															+.B \-f
														
 
															+Runs are restricted to pieces with balancing parentheses, to isolate
														
 
															+potential functions (C, Java, Pascal, Modula-2 and Lisp only).
														
 
															+.TP
														
 
															+.B \-F
														
 
															+The names of functions in calls are required to match exactly
														
 
															+(C, Java, Pascal, Modula-2 and Lisp only).
														
 
															+.TP
														
 
															+.B \-n
														
 
															+Similarities found are only summarized, not displayed.
														
 
															+.TP
														
 
															+.B "\-o F"
														
 
															+The output is written to the file named
														
 
															+.IR F .
														
 
															+.TP
														
 
															+.B \-p
														
 
															+The output is given in similarity percentages; see below.
														
 
															+.TP
														
 
															+.B "\-r N"
														
 
															+The minimum run length is set to
														
 
															+.I N
														
 
															+(default is
														
 
															+.I N
														
 
															+= 24).
														
 
															+.TP
														
 
															+.B \-s
														
 
															+The contents of a file are not compared to itself (\-s = not self).
														
 
															+.TP
														
 
															+.B \-S
														
 
															+The contents of the new files are compared to the old files only \- not
														
 
															+between themselves.
														
 
															+.TP
														
 
															+.B "\-w N"
														
 
															+The page width used is set to
														
 
															+.I N
														
 
															+columns (default is
														
 
															+.I N
														
 
															+= 80).
														
 
															+.PP
														
 
															+The
														
 
															+.B \-p
														
 
															+option results in lines of the form
														
 
															+.DS
														
 
															+.ft 5
														
 
															+F consists for x % of G material
														
 
															+.ft P
														
 
															+.DE
														
 
															+meaning that \f5x\fP % of \f5F\fP's text can also be found in \f5G\fP.
														
 
															+Note that this relation is not symmetric; it is in fact quite possible for one
														
 
															+file to consist for 100 % of text from another file, while the other file
														
 
															+consists for only 1 % of text of the first file, if their lengths differ
														
 
															+enough.
														
 
															+Note also that the granularity of the recognized text is still governed by the
														
 
															+.B \-r
														
 
															+option or its default.
														
 
															+.PP
														
 
															+Care has been taken to keep all internal processes linear in the length of the
														
 
															+input, with the exception of the matching process which is almost linear,
														
 
															+using a hash table; various other tables are used for speed-up.
														
 
															+If, however, there is not enough memory for the tables, they are discarded in
														
 
															+order of unimportance, under which conditions the algorithms revert to their
														
 
															+quadratic nature.
														
 
															+.SH AUTHOR
														
 
															+Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+.SH BUGS
														
 
															+Strong periodicity in the input text (like a table of
														
 
															+.I N
														
 
															+almost identical lines) causes problems.
														
 
															+.I Sim
														
 
															+tries to cope with this but cannot avoid giving appr.\&
														
 
															+.I log N
														
 
															+messages about it.
														
 
															+The best advice is still to take the offending files out of the game.
														
 
															+.PP
														
 
															+Since it uses
														
 
															+.I lex(1)
														
 
															+on some systems, it may dump core on any weird construction that overflows
														
 
															+.IR lex 's
														
 
															+internal buffers.
														
--- a/utils/sim_pasc/sim.c
+++ b/utils/sim_pasc/sim.c
@@ -0,0 +1,149 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: sim.c,v 2.12 2007/08/27 09:57:34 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<stdlib.h>
														
 
															+
														
 
															+#include	"settings.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"options.h"
														
 
															+#include	"language.h"
														
 
															+#include	"error.h"
														
 
															+#include	"hash.h"
														
 
															+#include	"compare.h"
														
 
															+#include	"pass1.h"
														
 
															+#include	"pass2.h"
														
 
															+#include	"pass3.h"
														
 
															+#include	"stream.h"
														
 
															+#include	"lex.h"
														
 
															+
														
 
															+unsigned int MinRunSize = DFLT_MIN_RUN_SIZE;
														
 
															+int PageWidth = DFLT_PAGE_WIDTH;
														
 
															+FILE *OutputFile;
														
 
															+FILE *DebugFile;
														
 
															+
														
 
															+struct text *Text;			/* to be filled in by malloc */
														
 
															+int NumberOfTexts;			/* number of text records */
														
 
															+int NumberOfNewTexts;			/* number of new text records */
														
 
															+
														
 
															+char *progname;				/* for error reporting */
														
 
															+
														
 
															+static const char *outputname;		/* for reporting */
														
 
															+static const char *minrunstring;
														
 
															+static const char *pagewidthstring;
														
 
															+
														
 
															+static const struct option optlist[] = {
														
 
															+	{'r', "minimum run size", 'N', &minrunstring},
														
 
															+	{'w', "page width", 'N', &pagewidthstring},
														
 
															+	{'f', "function-like forms only", ' ', 0},
														
 
															+	{'d', "use diff format for output", ' ', 0},
														
 
															+	{'p', "use percentage format for output", ' ', 0},
														
 
															+	{'e', "compare each file to each file separately", ' ', 0},
														
 
															+	{'s', "do not compare a file to itself", ' ', 0},
														
 
															+	{'S', "compare new files to old files only", ' ', 0},
														
 
															+	{'F', "keep function identifiers in tact", ' ', 0},
														
 
															+	{'n', "display headings only", ' ', 0},
														
 
															+	{'x', "no pass2 nl_buff allocation", ' ', 0},
														
 
															+	{'o', "write output to file F", 'F', &outputname},
														
 
															+	{'-', "lexical scan output only", ' ', 0},
														
 
															+	{0, 0, 0, 0}
														
 
															+};
														
 
															+
														
 
															+static void print_stream(const char *fname);
														
 
															+
														
 
															+/*	Entry point: processes the command line options, then either shows
															+	the token stream of each file (option '-') or runs the passes that
															+	read, compare and report on the input files.
															+	Returns 0; bad option values are reported through fatal().
															+*/
															+int
															+main(int argc, char *argv[]) {
															+	progname = argv[0];		/* save program name */
															+	argv++, argc--;			/* and skip it */
															+
															+	/* Set the default output and debug streams */
															+	OutputFile = stdout;
															+	DebugFile = stdout;
															+
															+	/* Get command line options */
															+	{	int nop = do_options(progname, optlist, argc, argv);
															+		argc -= nop, argv += nop;	/* skip them */
															+	}
															+
															+	/* Treat the value options */
															+	if (minrunstring) {
															+		MinRunSize = strtoul(minrunstring, NULL, 10);
															+		if (MinRunSize == 0) fatal("bad or zero run size; form is: -r N");
															+	}
															+	if (pagewidthstring) {
															+		PageWidth = atoi(pagewidthstring);
															+		/* a negative width is as unusable as a zero one */
															+		if (PageWidth <= 0) fatal("bad or zero page width; form is: -w N");
															+	}
															+	if (outputname) {
															+		OutputFile = fopen(outputname, "w");
															+		if (OutputFile == 0) {
															+			char msg[500];
															+
															+			/* the precision keeps an overlong file name
															+			   from overflowing msg[]
															+			*/
															+			sprintf(msg, "cannot open output file %.470s", outputname);
															+			fatal(msg);
															+			/*NOTREACHED*/
															+		}
															+	}
															+
															+	if (option_set('-')) {
															+		/* it is the lexical scan only */
															+		while (argv[0]) {
															+			print_stream(argv[0]);
															+			argv++;
															+		}
															+		return 0;
															+	}
															+
															+	/* Start processing */
															+	InitLanguage();
															+
															+	/* Read the input files */
															+	Pass1(argc, argv);
															+
															+	/* Set up the forward reference table */
															+	MakeForwardReferences();
															+
															+	/* Compare the input files to find runs */
															+	Compare();
															+
															+	/* Delete forward reference table */
															+	FreeForwardReferences();
															+
															+	/* Find positions of the runs found */
															+	Pass2();
															+
															+	/* Print the similarities */
															+	Pass3();
															+
															+	return 0;
															+}
														
 
															+
														
 
															+/*	Show on OutputFile the token stream the lexical analyser produces for
															+	the file fname; each line starts with the line and token counts.
															+	If the file cannot be opened, only a message to that effect is written.
															+*/
															+static void
															+print_stream(const char *fname) {
															+	fprintf(OutputFile, "File %s:", fname);
															+
															+	if (OpenStream(fname)) {
															+		fprintf(OutputFile, " showing token stream:\nnl_cnt, tk_cnt: tokens");
															+
															+		/* start as if an EOL was just seen, so the first line
															+		   gets its counts printed too
															+		*/
															+		lex_token = EOL;
															+		for (;;) {
															+			if (TOKEN_EQ(lex_token, EOL)) {
															+				fprintf(OutputFile, "\n%u,%u:",
															+					lex_nl_cnt, lex_tk_cnt
															+				);
															+			}
															+			else {
															+				print_token(OutputFile, lex_token);
															+			}
															+			if (!NextStreamTokenObtained()) break;
															+		}
															+
															+		fprintf(OutputFile, "\n");
															+		CloseStream();
															+	}
															+	else {
															+		fprintf(OutputFile, " cannot open\n");
															+	}
															+}
														
--- a/utils/sim_pasc/sim.h
+++ b/utils/sim_pasc/sim.h
@@ -0,0 +1,39 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: sim.h,v 2.7 2005/02/20 17:03:03 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+
														
 
															+struct position {
														
 
															+	/* position of first and last token of a chunk */
														
 
															+	struct position *ps_next;
														
 
															+	int ps_type;		/* first = 0, last = 1 */
														
 
															+	unsigned int ps_tk_cnt;	/* in tokens; set by add_run() in Pass 1 */
														
 
															+	unsigned int ps_nl_cnt;	/* same, in line numbers; set by Pass2(),
														
 
															+				   used by Pass3() to report line numbers
														
 
															+				*/
														
 
															+};
														
 
															+
														
 
															+struct text {
														
 
															+	char *tx_fname;		/* the file name */
														
 
															+	struct position *tx_pos;/* list of positions in this file that are
														
 
															+				   part of a chunk; sorted and updated by
														
 
															+				   Pass 2
														
 
															+				*/
														
 
															+	unsigned int tx_start;	/* positions in TokenArray[] for the text */
														
 
															+	unsigned int tx_limit;
														
 
															+	unsigned int tx_nl_start;/* possibly newline pointer for pass2 */
														
 
															+	unsigned int tx_nl_limit;
														
 
															+};
														
 
															+
														
 
															+extern unsigned int MinRunSize;
														
 
															+extern int PageWidth;
														
 
															+extern FILE *OutputFile;
														
 
															+extern FILE *DebugFile;
														
 
															+
														
 
															+extern struct text *Text;		/* Text[], one for each input file */
														
 
															+extern int NumberOfTexts;		/* number of text records */
														
 
															+extern int NumberOfNewTexts;		/* number of new text records */
														
 
															+
														
 
															+extern char *progname;			/* for error reporting */
														
--- a/utils/sim_pasc/sim.html
+++ b/utils/sim_pasc/sim.html
@@ -0,0 +1,116 @@
 
															+<HTML>
														
 
															+<!-- $Id: sim.html,v 1.7 2007/08/27 09:57:35 dick Exp $ -->
														
 
															+<HEAD>
														
 
															+<TITLE>The software and text similarity tester SIM</TITLE>
														
 
															+</HEAD>
														
 
															+
														
 
															+<BODY>
														
 
															+<H1>The software and text similarity tester SIM</H1>
														
 
															+
														
 
															+<H2>
														
 
															+<A HREF="http://www.cs.vu.nl/~dick/">Dick Grune</A>
														
 
															+</H2>
														
 
															+
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/README.1st">SIM</A>
														
 
															+tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp, Miranda,
														
 
															+and natural language.
														
 
															+It is used
														
 
															+<UL>
														
 
															+
														
 
															+<LI>
														
 
															+to detect potentially duplicated code fragments in large software
														
 
															+projects, in program text, in shell scripts and in documentation
														
 
															+</LI>
														
 
															+
														
 
															+<LI>
														
 
															+to detect plagiarism in software projects, educational and otherwise
														
 
															+</LI>
														
 
															+
														
 
															+</UL>
														
 
															+
														
 
															+<P>
														
 
															+SIM 2.19 is available as
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.shar">
														
 
															+C sources</A>
														
 
															+and as
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.zip">
														
 
															+MSDOS binaries</A>.
														
 
															+It is also available through ftp; the directory is
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester">
														
 
															+ftp.cs.vu.nl:/pub/dick/similarity_tester</A>.
														
 
															+There is a
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim.pdf">
														
 
															+Unix-style manual page</A>.
														
 
															+</P>
														
 
															+
														
 
															+<P>
														
 
															+The software similarity tester is very efficient and allows us to compare
														
 
															+this year's students' work with that collected from many past years (much to
														
 
															+the dismay of some, mostly non-CS, students).
														
 
															+Students are told that their work is going to be compared, but some are
														
 
															+non-believers ...
														
 
															+</P>
														
 
															+
														
 
															+<P>
														
 
															+The output of the similarity tester can be processed by a number of shell
														
 
															+scripts by Matty Huntjens
														
 
															+(<A HREF="http://www.cs.vu.nl/~matty/">matty@cs.vu.nl</A>).
														
 
															+These shell scripts take sim output and produce lists of suspect submissions,
														
 
															+histograms and the like.
														
 
															+The present version of these scripts is very much geared to the local
														
 
															+situation at the
														
 
															+<A HREF="http://www.vu.nl/">VU University Amsterdam</A>,
														
 
															+though; they are low on portability.
														
 
															+</P>
														
 
															+
														
 
															+<P>
														
 
															+We are not afraid that students would try to tune their work to the
														
 
															+similarity tester.
														
 
															+We reckon if they can do that they can also do the exercise.
														
 
															+</P>
														
 
															+
														
 
															+<P>
														
 
															+Since this piece of handicraft does not qualify as research, there are no
														
 
															+international papers on it.
														
 
															+The work was described in Dutch in
														
 
															+Dick Grune,
														
 
															+Matty Huntjens,
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/publications/Het_detecteren_van_kopieen_bij_informatica-practica.ps">
														
 
															+Het detecteren van kopie&euml;n bij informatica-practica</A>,
														
 
															+Informatie,
														
 
															+<STRONG>31</STRONG>,
														
 
															+11,
														
 
															+Nov 1989,
														
 
															+pp. 864-867
														
 
															+(<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/artikel.lit">
														
 
															+lit. ref.</A>).
														
 
															+An
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/Paper.ps">
														
 
															+English translation
														
 
															+</A>
														
 
															+of the paper is also available.
														
 
															+The ftp directory contains a terse
														
 
															+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/TechnReport">
														
 
															+technical report</A>
														
 
															+about the internal workings of the program.
														
 
															+</P>
														
 
															+
														
 
															+<H5>
														
 
															+<HR>
														
 
															+[<A HREF="CVS.html">Previous</A>]
														
 
															+[<A HREF="mag.html">Next</A>]
														
 
															+[<A HREF="http://www.cs.vu.nl/~dick/dick.html">Personal Page</A>]
														
 
															+[<A HREF="http://www.cs.vu.nl/~dick/">Professional Page</A>]
														
 
															+[<A HREF="http://www.cs.vu.nl/">CS</A>]
														
 
															+[<A HREF="http://www.few.vu.nl/">Faculty</A>]
														
 
															+[<A HREF="http://www.vu.nl/">VU University Amsterdam</A>]
														
 
															+<HR>
														
 
															+</H5>
														
 
															+
														
 
															+<ADDRESS>
														
 
															+The software and text similarity tester SIM / Dick Grune /
														
 
															+<A HREF="mailto:dick@cs.vu.nl">dick@cs.vu.nl</A>
														
 
															+</ADDRESS>
														
 
															+
														
 
															+</BODY>
														
 
															+</HTML>
														
--- a/utils/sim_pasc/sim.txt
+++ b/utils/sim_pasc/sim.txt
@@ -0,0 +1,198 @@
 
															+
														
 
															+
														
 
															+
														
 
															+User Commands                                              SIM(1)
														
 
															+
														
 
															+
														
 
															+
														
 
															+NAME
														
 
															+     sim - find similarities in C, Java, Pascal, Modula-2,  Lisp,
														
 
															+     Miranda or text files
														
 
															+
														
 
															+SYNOPSIS
														
 
															+     sim_c [ -[defFnpsS] -r N -w N -o F ] file ... [ / [ file ...
														
 
															+     ] ]
														
 
															+     sim_c ...
														
 
															+     sim_java ...
														
 
															+     sim_pasc ...
														
 
															+     sim_m2 ...
														
 
															+     sim_lisp ...
														
 
															+     sim_mira ...
														
 
															+     sim_text ...
														
 
															+
														
 
															+DESCRIPTION
														
 
															+     Sim_c reads the C files file ... and  looks  for  pieces  of
														
 
															+     text  that are similar; two pieces of program text are simi-
														
 
															+     lar if they only differ in layout, comment, identifiers  and
														
 
															+     the  contents  of  numbers,  strings and characters.  If any
														
 
															+     runs of sufficient length are found, they  are  reported  on
														
 
															+     standard output; the number of significant tokens in the run
														
 
															+     is given between square brackets.
														
 
															+
														
 
															+     Sim_java does the same for Java, sim_pasc for Pascal, sim_m2
														
 
															+     for  Modula-2,  sim_lisp for Lisp, and sim_mira for Miranda.
														
 
															+     Sim_text works on arbitrary text; it is occasionally  useful
														
 
															+     on shell scripts.
														
 
															+
														
 
															+     The program can be used for finding copied pieces of code in
														
 
															+     purportedly unrelated programs (with -s or -S), or for find-
														
 
															+     ing accidentally duplicated code in  larger  projects  (with
														
 
															+     -f).
														
 
															+
														
 
															+     If a / is present between the input files,  the  latter  are
														
 
															+     divided  into  a  group  of "new" files (before the /) and a
														
 
															+     group of "old" files; if there is no /, all files are "new".
														
 
															+     Old files are never compared to each other.  Since the simi-
														
 
															+     larity tester reads the files several times, it cannot  read
														
 
															+     from standard input.
														
 
															+
														
 
															+     There are the following options:
														
 
															+
														
 
															+     -d   The output is in a diff(1)-like format instead  of  the
														
 
															+          default 2-column format.
														
 
															+
														
 
															+     -e   Each file is compared to each file in  isolation;  this
														
 
															+          will  find all similarities between all texts involved,
														
 
															+          regardless of duplicates.
														
 
															+
														
 
															+     -f   Runs  are   restricted   to   pieces   with   balancing
														
 
															+          parentheses,  to  isolate potential functions (C, Java,
														
 
															+
														
 
															+
														
 
															+
														
 
															+Vrije Universiteit   Last change: 2001/11/13                    1
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+User Commands                                              SIM(1)
														
 
															+
														
 
															+
														
 
															+
														
 
															+          Pascal, Modula-2 and Lisp only).
														
 
															+
														
 
															+     -F   The names of functions in calls are required  to  match
														
 
															+          exactly (C, Java, Pascal, Modula-2 and Lisp only).
														
 
															+
														
 
															+     -n   Similarities found are only summarized, not displayed.
														
 
															+
														
 
															+     -o F The output is written to the file named F.
														
 
															+
														
 
															+     -p   The output is  given  in  similarity  percentages;  see
														
 
															+          below.
														
 
															+
														
 
															+     -r N The minimum run length is set to N (default is N = 24).
														
 
															+
														
 
															+     -s   The contents of a file are not compared to itself (-s =
														
 
															+          not self).
														
 
															+
														
 
															+     -S   The contents of the new files are compared to  the  old
														
 
															+          files only - not between themselves.
														
 
															+
														
 
															+     -w N The page width used is set to N columns (default is N =
														
 
															+          80).
														
 
															+
														
 
															+     The -p option results in lines of the form F consists for  x
														
 
															+     %  of  G  material  meaning that x % of F's text can also be
														
 
															+     found in G.  Note that this relation is not symmetric; it is
														
 
															+     in  fact quite possible for one file to consist for 100 % of
														
 
															+     text from another file, while the other  file  consists  for
														
 
															+     only  1 % of text of the first file, if their lengths differ
														
 
															+     enough.  Note also that the granularity  of  the  recognized
														
 
															+     text is still governed by the -r option or its default.
														
 
															+
														
 
															+     Care has been taken to keep all internal processes linear in
														
 
															+     the  length of the input, with the exception of the matching
														
 
															+     process which is almost linear, using a hash table;  various
														
 
															+     other  tables  are used for speed-up.  If, however, there is
														
 
															+     not enough memory for the  tables,  they  are  discarded  in
														
 
															+     order of unimportance, under which conditions the algorithms
														
 
															+     revert to their quadratic nature.
														
 
															+
														
 
															+AUTHOR
														
 
															+     Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+
														
 
															+BUGS
														
 
															+     Strong periodicity in the input text  (like  a  table  of  N
														
 
															+     almost  identical lines) causes problems.  Sim tries to cope
														
 
															+     with this but cannot avoid giving appr. log N messages about
														
 
															+     it.   The  best  advice is still to take the offending files
														
 
															+     out of the game.
														
 
															+
														
 
															+     Since it uses lex(1) on some systems, it may  dump  core  on
														
 
															+     any   weird   construction  that  overflows  lex's  internal
														
 
															+
														
 
															+
														
 
															+
														
 
															+Vrije Universiteit   Last change: 2001/11/13                    2
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+User Commands                                              SIM(1)
														
 
															+
														
 
															+
														
 
															+
														
 
															+     buffers.
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+Vrije Universiteit   Last change: 2001/11/13                    3
														
 
															+
														
 
															+
														
 
															+
														
--- a/utils/sim_pasc/sortlist.bdy
+++ b/utils/sim_pasc/sortlist.bdy
@@ -0,0 +1,57 @@
 
															+/*
														
 
															+	Module:	Sort Linked Lists
														
 
															+	Author:	dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam)
														
 
															+	Version:	Tue Sep 17 17:32:33 1991
														
 
															+
														
 
															+Description:
														
 
															+	This is the implementation part of a generic routine that sorts
														
 
															+	linked lists.
														
 
															+
														
 
															+Instantiation:
														
 
															+	See sortlist.spc
														
 
															+*/
														
 
															+
														
 
															+#ifndef	_SORT_EXTERN_DEFINED
														
 
															+static
														
 
															+#endif
														
 
															+void
														
 
															+SORT_NAME(struct SORT_STRUCT **lh) {
														
 
															+	/*	I've  never known that sorting a linked list was this
														
 
															+		complicated; what am I missing?
														
 
															+	*/
														
 
															+	register struct SORT_STRUCT **listhook = lh;
														
 
															+
														
 
															+	while (*listhook) {
														
 
															+		/* 0. the list is not empty -> there must be a smallest one */
														
 
															+		register struct SORT_STRUCT **hsmall;
														
 
															+
														
 
															+		/* 1. find (the pointer to) the smallest element */
														
 
															+		{
														
 
															+			register struct SORT_STRUCT **hook = listhook;
														
 
															+
														
 
															+			/* assume initially that first element is smallest */
														
 
															+			hsmall = hook;
														
 
															+			while (*hook) {
														
 
															+				if (SORT_BEFORE(*hook, *hsmall)) {
														
 
															+					/* revise opinion */
														
 
															+					hsmall = hook;
														
 
															+				}
														
 
															+				hook = &(*hook)->SORT_NEXT;
														
 
															+			}
														
 
															+		}
														
 
															+
														
 
															+		/* 2. move the smallest element to front */
														
 
															+		{
														
 
															+			register struct SORT_STRUCT *smallest = *hsmall;
														
 
															+
														
 
															+			/* remove it from the chain */
														
 
															+			*hsmall = smallest->SORT_NEXT;
														
 
															+			/* and insert it before the first element */
														
 
															+			smallest->SORT_NEXT = *listhook;
														
 
															+			*listhook = smallest;
														
 
															+		}
														
 
															+
														
 
															+		/* 3. skip over smallest element */
														
 
															+		listhook = &(*listhook)->SORT_NEXT;
														
 
															+	}
														
 
															+}
														
--- a/utils/sim_pasc/sortlist.spc
+++ b/utils/sim_pasc/sortlist.spc
@@ -0,0 +1,65 @@
 
															+/*
														
 
															+	Module:	Sort Linked Lists
														
 
															+	Author:	dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam)
														
 
															+	Version:	Tue Sep 17 17:32:33 1991
														
 
															+
														
 
															+Description:
														
 
															+	This is the specification part of a generic routine that sorts linked
														
 
															+	lists. The elements in the list are structs, each of which carries a
														
 
															+	pointer to the next element.
														
 
															+
														
 
															+Instantiation, inline:
														
 
															+	For each struct list type T, specify:
														
 
															+	-	a definition of SORT_STRUCT, the struct name of the linked
														
 
															+		structs
														
 
															+	-	a definition of SORT_NAME, the name of the resulting sort
														
 
															+		routine
														
 
															+	-	a definition of a routine
														
 
															+			int SORT_BEFORE(
														
 
															+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
														
 
															+			)
														
 
															+		which yields non-zero if v is to be sorted before w
														
 
															+	-	a definition of a field selector SORT_NEXT which names the
														
 
															+		field that points to the next struct SORT_STRUCT in the list
														
 
															+	-	#include	"sortlist.bdy"
														
 
															+
														
 
															+Instantiation, separate:
														
 
															+	For each struct list type T, create a file sortT.h which contains at
														
 
															+	least:
														
 
															+	-	a definition of SORT_STRUCT, the struct name of the linked
														
 
															+		structs
														
 
															+	-	a definition of SORT_NAME, the name of the resulting sort
														
 
															+		routine
														
 
															+	-	#include	"sortlist.spc"
														
 
															+
														
 
															+	This file sortT.h is to be included in all files that use the routine
														
 
															+	SORT_NAME.
														
 
															+
														
 
															+	For each struct list type T, create a file sortT.c which contains at
														
 
															+	least:
														
 
															+	-	#include	"sortT.h"
														
 
															+	-	a definition of a routine
														
 
															+			int SORT_BEFORE(
														
 
															+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
														
 
															+			)
														
 
															+		which yields non-zero if v is to be sorted before w
														
 
															+	-	a definition of a field selector SORT_NEXT which names the
														
 
															+		field that points to the next struct SORT_STRUCT in the list
														
 
															+	-	#include	"sortlist.bdy"
														
 
															+
														
 
															+	This file sortT.c compiles into the module object for SORT_STRUCT.
														
 
															+
														
 
															+Specification:
														
 
															+	The module supplies:
														
 
															+	-	void SORT_NAME(struct SORT_STRUCT **listhook)
														
 
															+		where 'listhook' is a pointer to the location that holds the
														
 
															+		pointer to the list to be sorted. Upon return, the list will
														
 
															+		be sorted, and the pointer updated.
														
 
															+		The routine will be defined static when instantiated inline.
														
 
															+
														
 
															+Implementation:
														
 
															+	Quadratic selection sort :-(.
														
 
															+*/
														
 
															+
														
 
															+extern void SORT_NAME(struct SORT_STRUCT **);
														
 
															+#define	_SORT_EXTERN_DEFINED
														
--- a/utils/sim_pasc/stream.c
+++ b/utils/sim_pasc/stream.c
@@ -0,0 +1,56 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: stream.c,v 2.7 2001/11/08 12:30:32 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<sys/types.h>
														
 
															+#include	<sys/stat.h>
														
 
															+
														
 
															+#include	"system.par"
														
 
															+#include	"token.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+#include	"stream.h"
														
 
															+
														
 
															+static FILE *fopen_regular_file(const char *fname);
														
 
															+
														
 
															+int
														
 
															+OpenStream(const char *fname) {
														
 
															+	int ok;
														
 
															+
														
 
															+	lex_nl_cnt = 1;
														
 
															+	lex_tk_cnt = 0;
														
 
															+	lex_non_ascii_cnt = 0;
														
 
															+
														
 
															+	/* start the lex machine */
														
 
															+	yyin = fopen_regular_file(fname);
														
 
															+	ok = (yyin != 0);
														
 
															+	if (!ok) {
														
 
															+		/* fake a stream, to simplify the rest of the program */
														
 
															+		yyin = fopen(NULLFILE, "r");
														
 
															+	}
														
 
															+	yystart();
														
 
															+	return ok;
														
 
															+}
														
 
															+
														
 
															+static FILE *fopen_regular_file(const char *fname) {
														
 
															+	struct stat buf;
														
 
															+	
														
 
															+	if (stat(fname, &buf) != 0) return 0;
														
 
															+	if ((buf.st_mode & S_IFMT) != S_IFREG) return 0;
														
 
															+	return fopen(fname, "r");
														
 
															+}
														
 
															+
														
 
															+int
														
 
															+NextStreamTokenObtained(void) {
														
 
															+	return yylex();
														
 
															+}
														
 
															+
														
 
															+void
														
 
															+CloseStream(void) {
														
 
															+	if (yyin) {
														
 
															+		fclose(yyin);
														
 
															+		yyin = 0;
														
 
															+	}
														
 
															+}
														
--- a/utils/sim_pasc/stream.h
+++ b/utils/sim_pasc/stream.h
@@ -0,0 +1,17 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: stream.h,v 2.4 1998/02/03 14:28:36 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Interface of the stream module.
														
 
															+
														
 
															+	Implements the direct interaction with the lexical
														
 
															+	module.  It supplies the routines below.
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+extern int OpenStream(const char *);
														
 
															+extern int NextStreamTokenObtained(void);
														
 
															+extern void CloseStream(void);
														
--- a/utils/sim_pasc/sysidf.mk
+++ b/utils/sim_pasc/sysidf.mk
@@ -0,0 +1,17 @@
 
															+#	I N S T A L L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+BINDIR =	/home/dick/bin.`arch`
														
 
															+MANDIR =	/home/dick/man/man1
														
 
															+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
														
 
															+
														
 
															+#	C O M P I L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+EXE =		#
														
 
															+CC =		gcc -pedantic -Wall
														
 
															+LEX =		flex
														
 
															+COPY =		cp -p
														
 
															+ZIP =		zip -o
														
 
															+LINT =		lint -ansi $(SYSTEM)
														
 
															+LINTFLAGS =	-xh
														
 
															+
														
 
															+SYSTEM =	-DUNIX
														
--- a/utils/sim_pasc/sysidf.msdos
+++ b/utils/sim_pasc/sysidf.msdos
@@ -0,0 +1,17 @@
 
															+#	I N S T A L L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+BINDIR =	/com
														
 
															+MANDIR =	/man
														
 
															+
														
 
															+
														
 
															+#	C O M P I L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+EXE =		.exe
														
 
															+CC =		gcc -pedantic -Wall
														
 
															+LEX =		flex
														
 
															+COPY =		xcopy
														
 
															+ZIP =		pkzip -ko
														
 
															+ATFILEARGS =	gcc.exe:ar.exe:lint.exe## use DOS at-convention for these
														
 
															+LINT =		lint -ansi $(SYSTEM)
														
 
															+
														
 
															+SYSTEM =	-DMSDOS
														
--- a/utils/sim_pasc/sysidf.unix
+++ b/utils/sim_pasc/sysidf.unix
@@ -0,0 +1,19 @@
 
															+#	I N S T A L L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+BINDIR =	/home/dick/bin.`arch`
														
 
															+MANDIR =	/home/dick/man/man1
														
 
															+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
														
 
															+FTPFILES =	README.1st READ_ME TechnReport
														
 
															+VERSION =	2_19
														
 
															+
														
 
															+#	C O M P I L A T I O N   P A R A M E T E R S
														
 
															+
														
 
															+EXE =		#
														
 
															+CC =		gcc -pedantic -Wall
														
 
															+LEX =		flex
														
 
															+COPY =		cp -p
														
 
															+ZIP =		zip -o
														
 
															+LINT =		lint -ansi $(SYSTEM)
														
 
															+LINTFLAGS =	-xh
														
 
															+
														
 
															+SYSTEM =	-DUNIX
														
--- a/utils/sim_pasc/system.par
+++ b/utils/sim_pasc/system.par
@@ -0,0 +1,20 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: system.par,v 1.2 2001/09/28 09:03:55 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Operating-system dependent data */
														
 
															+
														
 
															+#ifdef	UNIX
														
 
															+
														
 
															+#define	int32		int		/* type of a 32 bits signed int */
														
 
															+#define	NULLFILE	"/dev/null"
														
 
															+
														
 
															+#endif
														
 
															+
														
 
															+#ifdef	MSDOS		/* GNU gcc */
														
 
															+
														
 
															+#define	int32		int		/* type of a 32 bits signed int */
														
 
															+#define	NULLFILE	"nul"
														
 
															+
														
 
															+#endif
														
--- a/utils/sim_pasc/text.c
+++ b/utils/sim_pasc/text.c
@@ -0,0 +1,236 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: text.c,v 1.2 2001/11/13 12:55:58 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"debug.par"
														
 
															+#include	"sim.h"
														
 
															+#include	"token.h"
														
 
															+#include	"stream.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"options.h"
														
 
															+#include	"error.h"
														
 
															+#include	"text.h"
														
 
															+
														
 
															+struct newline {
														
 
															+	unsigned char nl_tk_diff;	/* token position difference */
														
 
															+};
														
 
															+
														
 
															+#define	NL_INCR		1000		/* increment of newline buffer size */
														
 
															+
														
 
															+static struct newline *nl_buff;		/* to be filled by malloc */
														
 
															+static unsigned int nl_size;		/* size of nl_buff[] */
														
 
															+static unsigned int nl_free;		/* next free position in nl_buff[] */
														
 
															+
														
 
															+static unsigned int nl_next, nl_limit;	/* nl_buff[] pointers during pass 2 */
														
 
															+
														
 
															+static void store_newline(void);
														
 
															+static void init_nl_buff(void);
														
 
															+
														
 
															+/*							TEXT INTERFACE */
														
 
															+
														
 
															+static unsigned int last_tk_cnt;	/* token count at newline */
														
 
															+static unsigned int last_nl_cnt;	/* nl counter during pass 2 */
														
 
															+
														
 
															+void
															+InitText(int nfiles) {
															+	/*	Records the number of texts in NumberOfTexts, allocates the
															+		array of text descriptors Text[] with one entry per text, and
															+		sets up the newline buffer. Ends through fatal() when the
															+		array cannot be obtained.
															+
															+		The byte count is handed to malloc() as a size_t. Forcing it
															+		through unsigned int could silently truncate it where size_t
															+		is the wider type, yielding an array with fewer entries than
															+		NumberOfTexts. A negative count, or one whose byte size does
															+		not fit in a size_t, is reported as out of memory.
															+	*/
															+	NumberOfTexts = nfiles;
															+	if (	nfiles < 0
															+	||	(size_t)nfiles > (size_t)-1 / sizeof (struct text)
															+	) {
															+		fatal("out of memory");
															+	}
															+
															+	Text = (struct text *)
															+		malloc((size_t)nfiles * sizeof (struct text));
															+	if (!Text) fatal("out of memory");
															+
															+	init_nl_buff();
															+}
														
 
															+
														
 
															+int
															+OpenText(enum Pass pass, struct text *txt) {
															+	/*	Prepares txt for scanning in the given pass. The result is 1
															+		when pass 2 is served from the newline buffer; in all other
															+		cases it is the result of OpenStream() on the file of txt.
															+	*/
															+	if (pass == First) {
															+		last_tk_cnt = 0;
															+		/* note where the newline info of this text starts */
															+		if (nl_buff) txt->tx_nl_start = nl_free;
															+	}
															+	else if (pass == Second) {
															+		last_tk_cnt = 0;
															+		if (nl_buff) {
															+			/* replay the recorded newlines; no file is opened */
															+			nl_limit = txt->tx_nl_limit;
															+			nl_next = txt->tx_nl_start;
															+			lex_tk_cnt = 0;
															+			lex_nl_cnt = 1;
															+			last_nl_cnt = 1;
															+			return 1;
															+		}
															+	}
															+
															+	return OpenStream(txt->tx_fname);
															+}
														
 
															+
														
 
															+int
															+NextTextTokenObtained(enum Pass pass) {
															+	/*	Moves on to the next token of interest for the given pass;
															+		the result is 0 when none was obtained.
															+		Pass 1 takes every token from the stream and records each EOL
															+		in the newline buffer. Pass 2 delivers EOLs only, from the
															+		newline buffer if there is one and from the stream otherwise.
															+	*/
															+	if (pass == First) {
															+		int got = NextStreamTokenObtained();
															+
															+		if (TOKEN_EQ(lex_token, EOL)) {
															+			store_newline();
															+			last_tk_cnt = lex_tk_cnt;
															+		}
															+		return got;
															+	}
															+
															+	if (pass != Second) return 0;
															+
															+	if (!nl_buff) {
															+		/* no recorded info: scan the file itself up to the next EOL */
															+		for (;;) {
															+			int got = NextStreamTokenObtained();
															+
															+			if (!got || TOKEN_EQ(lex_token, EOL)) return got;
															+		}
															+	}
															+
															+	if (nl_next == nl_limit) return 0;	/* all newlines replayed */
															+
															+	/* reconstruct the position at the next recorded newline */
															+	last_nl_cnt++;
															+	last_tk_cnt += nl_buff[nl_next++].nl_tk_diff;
															+	lex_nl_cnt = last_nl_cnt;
															+	lex_tk_cnt = last_tk_cnt;
															+	lex_token = EOL;
															+	return 1;
															+}
														
 
															+
														
 
															+void
															+CloseText(enum Pass pass, struct text *txt) {
															+	/*	Finishes the scan of txt for the given pass and closes the
															+		stream. After pass 1, and only while the newline buffer is
															+		in use, the end of the newline info of txt is recorded.
															+	*/
															+	if (pass == First && nl_buff) {
															+		/* tokens after the last newline get an entry of their own */
															+		if (lex_tk_cnt != last_tk_cnt) store_newline();
															+		txt->tx_nl_limit = nl_free;
															+	}
															+
															+	CloseStream();
															+}
														
 
															+
														
 
															+/*							NEWLINE CACHING */
														
 
															+
														
 
															+/*	To speed up pass2 which is interested in token positions at line ends,
														
 
															+	the newline buffer keeps this info from pass1. To reduce the size of
														
 
															+	the newline buffer, the info is kept as the differences of the values
														
 
															+	at consecutive line ends. This allows unsigned chars to be used rather
														
 
															+	than integers.
														
 
															+
														
 
															+	The recording of token position differences at EOL is optional, and
														
 
															+	is switched off if
														
 
															+	-	there is not room enough for the newline buffer.
														
 
															+	-	a difference would not fit in the field in the struct.
														
 
															+	Switching off is done by freeing the buffer and setting nl_buff to 0.
														
 
															+	Anybody using nl_buff should therefore test for nl_buff being zero.
														
 
															+*/
														
 
															+
														
 
															+static void abandon_nl_buff(void);
														
 
															+
														
 
															+static void
															+init_nl_buff(void) {
															+	/*	Sets the newline buffer size to NL_INCR entries and tries to
															+		allocate it. With option 'x' set no allocation is attempted.
															+		In that case, and when malloc() fails, nl_buff is left 0.
															+	*/
															+	nl_size = NL_INCR;
															+	if (option_set('x')) {
															+		nl_buff = 0;
															+	}
															+	else {
															+		nl_buff = (struct newline *)
															+			malloc(sizeof (struct newline) * nl_size);
															+	}
															+}
														
 
															+
														
 
															+static void
															+store_newline(void) {
															+	/*	Appends to the newline buffer the number of tokens seen since
															+		the previous line end. Does nothing when there is no buffer.
															+		The buffer is abandoned when it cannot be enlarged or when
															+		the number does not fit in the nl_tk_diff field.
															+	*/
															+	struct newline *entry;
															+	unsigned int diff;
															+
															+	if (!nl_buff) return;
															+
															+	if (nl_free == nl_size) {
															+		/* no room left; ask for NL_INCR more entries */
															+		unsigned int grown_size = nl_size + NL_INCR;
															+		struct newline *grown_buff = (struct newline *)realloc(
															+			(char *)nl_buff,
															+			sizeof (struct newline) * grown_size
															+		);
															+
															+		if (!grown_buff) {
															+			abandon_nl_buff();
															+			return;
															+		}
															+		nl_buff = grown_buff;
															+		nl_size = grown_size;
															+	}
															+
															+	diff = lex_tk_cnt - last_tk_cnt;
															+	entry = &nl_buff[nl_free++];
															+	entry->nl_tk_diff = diff;
															+	/* reading the field back reveals whether the value was cut off */
															+	if (entry->nl_tk_diff != diff) abandon_nl_buff();
															+}
														
 
															+
														
 
															+static void
															+abandon_nl_buff(void) {
															+	/* Releases the newline buffer, if any, and marks it as absent. */
															+	if (!nl_buff) return;
															+
															+	free((char *)nl_buff);
															+	nl_buff = 0;
															+}
														
 
															+
														
 
															+#ifdef	DB_NL_BUFF
														
 
															+
														
 
															+void
															+db_print_nl_buff(unsigned int start, unsigned int limit) {
															+	/*	Debugging aid: writes the nl_tk_diff fields of the newline
															+		buffer entries start .. limit-1 to DebugFile. A diagnostic is
															+		written instead when there is no newline buffer, or when
															+		start or limit lies beyond nl_free.
															+	*/
															+	unsigned int i;		/* unsigned, like the bounds it runs between */
															+
															+	fprintf(DebugFile, "\n**** DB_NL_BUFF ****\n");
															+	if (!nl_buff) {
															+		fprintf(DebugFile, ">>>> NO NL_BUFF\n\n");
															+		return;
															+	}
															+
															+	if (start > nl_free) {
															+		fprintf(DebugFile, ">>>> start (%u) > nl_free (%u)\n\n",
															+			start, nl_free
															+		);
															+		return;
															+	}
															+	if (limit > nl_free) {
															+		fprintf(DebugFile, ">>>> limit (%u) > nl_free (%u)\n\n",
															+			limit, nl_free
															+		);
															+		return;
															+	}
															+
															+	fprintf(DebugFile, "nl_buff: %u entries:\n", nl_free);
															+	for (i = start; i < limit; i++) {
															+		struct newline *nl = &nl_buff[i];
															+
															+		/* the unsigned char is promoted to int, hence %d */
															+		fprintf(DebugFile, "nl_tk_diff = %d\n", nl->nl_tk_diff);
															+	}
															+	fprintf(DebugFile, "\n");
															+}
														
 
															+
														
 
															+#endif	/* DB_NL_BUFF */
														
--- a/utils/sim_pasc/text.h
+++ b/utils/sim_pasc/text.h
@@ -0,0 +1,20 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: text.h,v 1.2 2001/09/28 09:03:56 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*	Implements the access to the lexical scanner.
														
 
															+	Additionally, the module tries to save newline information,
														
 
															+	anticipating a second scan which is interested in this
														
 
															+	information only.
														
 
															+*/
														
 
															+
														
 
															+extern void InitText(int nfiles);	/* allocates Text[] for nfiles texts and sets up the newline buffer */
															+enum Pass {First, Second};	/* tells the routines below which of the two scans is running */
															+extern int OpenText(enum Pass pass, struct text *txt);	/* 1 if pass 2 uses the newline buffer, else the result of OpenStream() */
															+extern int NextTextTokenObtained(enum Pass pass);	/* 0 when no further token was obtained */
															+extern void CloseText(enum Pass pass, struct text *txt);	/* closes the stream; pass 1 also records where the newline info of txt ends */
														
 
															+
														
 
															+#ifdef	DB_NL_BUFF
														
 
															+extern void db_print_nl_buff(unsigned int start, unsigned int limit);
														
 
															+#endif
														
--- a/utils/sim_pasc/textlang.l
+++ b/utils/sim_pasc/textlang.l
@@ -0,0 +1,72 @@
 
															+%{
														
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: textlang.l,v 1.3 2007/08/29 09:10:36 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Text front end for the similarity tester.
														
 
															+*/
														
 
															+
														
 
															+#include	"language.h"
														
 
															+#include	"token.h"
														
 
															+#include	"idf.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"lang.h"
														
 
															+
														
 
															+/* Language-dependent Code */
														
 
															+
														
 
															+void
															+InitLanguage(void) {	/* empty: this front end does no set-up */
															+}
														
 
															+
														
 
															+/*ARGSUSED*/
														
 
															+int
															+MayBeStartOfRun(TOKEN tk) {
															+	/* In plain text a run may start at any token: the answer is always 1. */
															+	(void)tk;		/* deliberately not looked at */
															+	return 1;
															+}
														
 
															+
														
 
															+/*ARGSUSED*/
														
 
															+unsigned int
															+CheckRun(const TOKEN *str, unsigned int size) {
															+	/* In plain text every run is accepted in full: size is returned as is. */
															+	(void)str;		/* deliberately not looked at */
															+	return size;
															+}
														
 
															+
														
 
															+%}
														
 
															+
														
 
															+%option nounput
														
 
															+%option never-interactive
														
 
															+
														
 
															+Layout		([ \t\r\f])
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+[^ \t\r\f\n]+	{			/* a word */
															+		/*	a word is defined as anything not containing
															+			layout or a newline; the excluded set must
															+			list every character of Layout, otherwise
															+			\r and \f would end up inside words and the
															+			Layout rule below would never see them
															+		*/
															+		return_tk(idf_hashed(yytext));
															+	}
															+
															+\n	{				/* count newlines */
															+		return_eol();
															+	}
															+
															+{Layout}	{			/* ignore layout */
															+	}
														
 
															+
														
 
															+%%
														
 
															+
														
 
															+/* Language-INdependent Code */
														
 
															+
														
 
															+void
															+yystart(void) {
															+	BEGIN INITIAL;	/* switch the scanner to its INITIAL start condition */
															+}
														
 
															+
														
 
															+int
															+yywrap(void) {
															+	return 1;	/* tells the generated scanner that no further input follows at end of file */
															+}
														
--- a/utils/sim_pasc/token.c
+++ b/utils/sim_pasc/token.c
@@ -0,0 +1,44 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: token.c,v 2.4 2001/11/13 12:55:58 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Token interface, implementation part.
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+void
															+print_token(FILE *ofile, TOKEN tk) {
															+	/*	Writes tk to ofile as exactly two characters: a prefix that
															+		tells the class of the token, followed by a printable
															+		rendering of its low seven bits:
															+
															+			low 7 bits	bit 8 clear	bit 8 set (meta)
															+			control		^A		$A
															+			printable	 A		#A
															+			DEL		^?		$?
															+	*/
															+	int code = TOKEN2int(tk);
															+	int low = code & 0177;
															+	int is_meta = (code & 0200) != 0;
															+	int prefix;
															+	int shown;
															+
															+	if (low >= ' ' && low <= '~') {
															+		prefix = is_meta ? '#' : ' ';
															+		shown = low;
															+	}
															+	else {
															+		prefix = is_meta ? '$' : '^';
															+		/* control characters are shown as the matching letter */
															+		shown = (low == 0177) ? '?' : low + '@';
															+	}
															+
															+	fprintf(ofile, "%c%c", prefix, shown);
															+}
														
 
															+
														
 
															+#ifdef	TESTTOKEN
														
 
															+
														
 
															+int
															+TOKEN_EQ(TOKEN t1, TOKEN t2) {
															+	/*	Function form of the token comparison, used in the strict
															+		version so that both operands are checked to be TOKENs.
															+	*/
															+	int left = TOKEN2int(t1);
															+	int right = TOKEN2int(t2);
															+
															+	return left == right;
															+}
														
 
															+
														
 
															+#endif	/* TESTTOKEN */
														
--- a/utils/sim_pasc/token.h
+++ b/utils/sim_pasc/token.h
@@ -0,0 +1,52 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: token.h,v 2.4 2001/11/13 12:55:59 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+/*
														
 
															+	Token interface.
														
 
															+	Since the definition of a token has been a continual source of
														
 
															+	problems, it is now defined as an Abstract Data Type.
														
 
															+	To allow stronger type checking, there is a special version for use
														
 
															+	by lint.
														
 
															+*/
														
 
															+
														
 
															+#include	<stdio.h>
														
 
															+
														
 
															+#ifndef	TOKEN
														
 
															+
														
 
															+#ifdef	lint
														
 
															+#define	TESTTOKEN
														
 
															+#endif
														
 
															+
														
 
															+#ifdef	TESTTOKEN				/* strict version */
														
 
															+
														
 
															+struct cccc {
														
 
															+	int cccc;
														
 
															+};
														
 
															+
														
 
															+typedef struct cccc *lintTOKEN;
														
 
															+#define	TOKEN		lintTOKEN
														
 
															+#define	TOKEN2int(c)	((int)(c))
														
 
															+#define	int2TOKEN(i)	((TOKEN)(i))
														
 
															+extern int TOKEN_EQ(TOKEN t1, TOKEN t2);
														
 
															+
														
 
															+#else						/* production version */
														
 
															+
														
 
															+#define	TOKEN		unsigned char
														
 
															+#define	TOKEN2int(c)	((c)&0377)
														
 
															+#define	int2TOKEN(i)	((TOKEN)(i))
														
 
															+#define	TOKEN_EQ(t1,t2)	(TOKEN2int(t1) == TOKEN2int(t2))
														
 
															+
														
 
															+#endif	/* TESTTOKEN */
														
 
															+
														
 
															+#endif	/* TOKEN */
														
 
															+
														
 
															+/* Macros for the composition of tokens */
														
 
															+#define	NORM(ch)	int2TOKEN((ch)&0377)
														
 
															+#define	CTRL(ch)	int2TOKEN((ch)&0037)
														
 
															+#define	META(ch)	int2TOKEN((ch)|0200)
														
 
															+#define	MTCT(ch)	int2TOKEN(((ch)&0037)|0200)
														
 
															+#define	NOTOKEN		int2TOKEN(0)
														
 
															+
														
 
															+extern void print_token(FILE *ofile, TOKEN tk);	/* in two characters */
														
--- a/utils/sim_pasc/tokenarray.c
+++ b/utils/sim_pasc/tokenarray.c
@@ -0,0 +1,52 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: tokenarray.c,v 1.2 2001/11/13 12:55:59 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	<malloc.h>
														
 
															+
														
 
															+#include	"error.h"
														
 
															+#include	"lex.h"
														
 
															+#include	"tokenarray.h"
														
 
															+
														
 
															+#define	TK_INCR		10000		/* increment of token array size */
														
 
															+
														
 
															+TOKEN *TokenArray;			/* to be filled by malloc */
														
 
															+static unsigned int tk_size;		/* size of TokenArray[] */
														
 
															+static unsigned int tk_free;		/* next free position in TokenArray[] */
														
 
															+
														
 
															+void
															+InitTokenArray(void) {
															+	/*	Allocates the token array with room for TK_INCR tokens and
															+		ends through fatal() when that fails. Storing starts at
															+		position 1; position 0 stays unused.
															+	*/
															+	tk_size = TK_INCR;
															+	TokenArray = (TOKEN *)malloc(sizeof (TOKEN) * tk_size);
															+	if (TokenArray == 0) {
															+		fatal("out of memory");
															+	}
															+	tk_free = 1;
															+}
														
 
															+
														
 
															+void
															+StoreToken(void) {
															+	/*	Appends the current token, lex_token, to TokenArray[]. A
															+		full array is enlarged by TK_INCR entries first. Ends through
															+		fatal() when the new size wraps around or when no memory
															+		can be obtained.
															+	*/
															+	if (tk_free == tk_size) {
															+		/* allocated array is full; try to increase its size */
															+		unsigned int new_size = tk_size + TK_INCR;
															+		TOKEN *new_array;
															+
															+		/*	Test for wrap-around before calling realloc(): a
															+			wrapped new_size is smaller than the present size,
															+			and handing it to realloc() would cut the array
															+			down before the error is noticed.
															+		*/
															+		if (new_size < tk_free)
															+			fatal("internal error: TK_INCR causes numeric overflow");
															+
															+		new_array = (TOKEN *)realloc(
															+			(char *)TokenArray,
															+			sizeof (TOKEN) * new_size
															+		);
															+		if (!new_array) {
															+			/* we failed */
															+			fatal("out of memory");
															+		}
															+		TokenArray = new_array, tk_size = new_size;
															+	}
															+
															+	/* now we are sure there is room enough */
															+	TokenArray[tk_free++] = lex_token;
															+}
														
 
															+
														
 
															+unsigned int
															+TextLength(void) {
															+	return tk_free;	/* index of the next free position in TokenArray[]; position 0 is never used */
															+}
														
--- a/utils/sim_pasc/tokenarray.h
+++ b/utils/sim_pasc/tokenarray.h
@@ -0,0 +1,13 @@
 
															+/*	This file is part of the software similarity tester SIM.
														
 
															+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
														
 
															+	$Id: tokenarray.h,v 1.1 2001/09/28 09:03:42 dick Exp $
														
 
															+*/
														
 
															+
														
 
															+#include	"token.h"
														
 
															+
														
 
															+/* Interface for the token storage */
														
 
															+extern void InitTokenArray(void);
														
 
															+extern void StoreToken(void);
														
 
															+extern unsigned int TextLength(void);	/* also first free token position */
														
 
															+extern TOKEN *TokenArray;
														
 
															+