Browse Source

* source code similarity tester (import of original 2.21 sources available
at http://www.cs.vu.nl/pub/dick/similarity_tester/)

git-svn-id: trunk@9286 -

Jonas Maebe 17 years ago
parent
commit
30e4da99da
67 changed files with 6795 additions and 0 deletions
  1. 66 0
      .gitattributes
  2. 57 0
      utils/sim_pasc/Answers
  3. 580 0
      utils/sim_pasc/ChangeLog
  4. 31 0
      utils/sim_pasc/LICENSE.txt
  5. 566 0
      utils/sim_pasc/Makefile
  6. 34 0
      utils/sim_pasc/READ.ME
  7. 68 0
      utils/sim_pasc/README.1st
  8. 52 0
      utils/sim_pasc/READ_ME
  9. 214 0
      utils/sim_pasc/TechnReport
  10. 70 0
      utils/sim_pasc/add_run.c
  11. 19 0
      utils/sim_pasc/add_run.h
  12. 186 0
      utils/sim_pasc/aiso.bdy
  13. 102 0
      utils/sim_pasc/aiso.spc
  14. 135 0
      utils/sim_pasc/algollike.c
  15. 27 0
      utils/sim_pasc/algollike.h
  16. 252 0
      utils/sim_pasc/clang.l
  17. 198 0
      utils/sim_pasc/compare.c
  18. 11 0
      utils/sim_pasc/compare.h
  19. 20 0
      utils/sim_pasc/debug.par
  20. 16 0
      utils/sim_pasc/error.c
  21. 6 0
      utils/sim_pasc/error.h
  22. 386 0
      utils/sim_pasc/hash.c
  23. 12 0
      utils/sim_pasc/hash.h
  24. 67 0
      utils/sim_pasc/idf.c
  25. 31 0
      utils/sim_pasc/idf.h
  26. 270 0
      utils/sim_pasc/javalang.l
  27. 32 0
      utils/sim_pasc/lang.h
  28. 17 0
      utils/sim_pasc/language.h
  29. 16 0
      utils/sim_pasc/lex.c
  30. 19 0
      utils/sim_pasc/lex.h
  31. 123 0
      utils/sim_pasc/lisplang.l
  32. 319 0
      utils/sim_pasc/m2lang.l
  33. 131 0
      utils/sim_pasc/miralang.l
  34. 123 0
      utils/sim_pasc/options.c
  35. 20 0
      utils/sim_pasc/options.h
  36. 256 0
      utils/sim_pasc/pascallang.l
  37. 119 0
      utils/sim_pasc/pass1.c
  38. 9 0
      utils/sim_pasc/pass1.h
  39. 154 0
      utils/sim_pasc/pass2.c
  40. 9 0
      utils/sim_pasc/pass2.h
  41. 356 0
      utils/sim_pasc/pass3.c
  42. 7 0
      utils/sim_pasc/pass3.h
  43. 115 0
      utils/sim_pasc/percentages.c
  44. 7 0
      utils/sim_pasc/percentages.h
  45. 11 0
      utils/sim_pasc/runs.c
  46. 33 0
      utils/sim_pasc/runs.h
  47. 8 0
      utils/sim_pasc/settings.par
  48. 176 0
      utils/sim_pasc/sim.1
  49. 149 0
      utils/sim_pasc/sim.c
  50. 39 0
      utils/sim_pasc/sim.h
  51. 116 0
      utils/sim_pasc/sim.html
  52. 198 0
      utils/sim_pasc/sim.txt
  53. 57 0
      utils/sim_pasc/sortlist.bdy
  54. 65 0
      utils/sim_pasc/sortlist.spc
  55. 56 0
      utils/sim_pasc/stream.c
  56. 17 0
      utils/sim_pasc/stream.h
  57. 17 0
      utils/sim_pasc/sysidf.mk
  58. 17 0
      utils/sim_pasc/sysidf.msdos
  59. 19 0
      utils/sim_pasc/sysidf.unix
  60. 20 0
      utils/sim_pasc/system.par
  61. 236 0
      utils/sim_pasc/text.c
  62. 20 0
      utils/sim_pasc/text.h
  63. 72 0
      utils/sim_pasc/textlang.l
  64. 44 0
      utils/sim_pasc/token.c
  65. 52 0
      utils/sim_pasc/token.h
  66. 52 0
      utils/sim_pasc/tokenarray.c
  67. 13 0
      utils/sim_pasc/tokenarray.h

+ 66 - 0
.gitattributes

@@ -8965,6 +8965,72 @@ utils/ptop.pp svneol=native#text/plain
 utils/ptopu.pp svneol=native#text/plain
 utils/ptopu.pp svneol=native#text/plain
 utils/rmcvsdir.pp svneol=native#text/plain
 utils/rmcvsdir.pp svneol=native#text/plain
 utils/rstconv.pp svneol=native#text/plain
 utils/rstconv.pp svneol=native#text/plain
+utils/sim_pasc/Answers svneol=native#text/plain
+utils/sim_pasc/ChangeLog svneol=native#text/plain
+utils/sim_pasc/LICENSE.txt svneol=native#text/plain
+utils/sim_pasc/Makefile svneol=native#text/plain
+utils/sim_pasc/READ.ME svneol=native#text/plain
+utils/sim_pasc/README.1st svneol=native#text/plain
+utils/sim_pasc/READ_ME svneol=native#text/plain
+utils/sim_pasc/TechnReport svneol=native#text/plain
+utils/sim_pasc/add_run.c svneol=native#text/plain
+utils/sim_pasc/add_run.h svneol=native#text/plain
+utils/sim_pasc/aiso.bdy svneol=native#text/plain
+utils/sim_pasc/aiso.spc svneol=native#text/plain
+utils/sim_pasc/algollike.c svneol=native#text/plain
+utils/sim_pasc/algollike.h svneol=native#text/plain
+utils/sim_pasc/clang.l svneol=native#text/plain
+utils/sim_pasc/compare.c svneol=native#text/plain
+utils/sim_pasc/compare.h svneol=native#text/plain
+utils/sim_pasc/debug.par svneol=native#text/plain
+utils/sim_pasc/error.c svneol=native#text/plain
+utils/sim_pasc/error.h svneol=native#text/plain
+utils/sim_pasc/hash.c svneol=native#text/plain
+utils/sim_pasc/hash.h svneol=native#text/plain
+utils/sim_pasc/idf.c svneol=native#text/plain
+utils/sim_pasc/idf.h svneol=native#text/plain
+utils/sim_pasc/javalang.l svneol=native#text/plain
+utils/sim_pasc/lang.h svneol=native#text/plain
+utils/sim_pasc/language.h svneol=native#text/plain
+utils/sim_pasc/lex.c svneol=native#text/plain
+utils/sim_pasc/lex.h svneol=native#text/plain
+utils/sim_pasc/lisplang.l svneol=native#text/plain
+utils/sim_pasc/m2lang.l svneol=native#text/plain
+utils/sim_pasc/miralang.l svneol=native#text/plain
+utils/sim_pasc/options.c svneol=native#text/plain
+utils/sim_pasc/options.h svneol=native#text/plain
+utils/sim_pasc/pascallang.l svneol=native#text/plain
+utils/sim_pasc/pass1.c svneol=native#text/plain
+utils/sim_pasc/pass1.h svneol=native#text/plain
+utils/sim_pasc/pass2.c svneol=native#text/plain
+utils/sim_pasc/pass2.h svneol=native#text/plain
+utils/sim_pasc/pass3.c svneol=native#text/plain
+utils/sim_pasc/pass3.h svneol=native#text/plain
+utils/sim_pasc/percentages.c svneol=native#text/plain
+utils/sim_pasc/percentages.h svneol=native#text/plain
+utils/sim_pasc/runs.c svneol=native#text/plain
+utils/sim_pasc/runs.h svneol=native#text/plain
+utils/sim_pasc/settings.par svneol=native#text/plain
+utils/sim_pasc/sim.1 svneol=native#text/plain
+utils/sim_pasc/sim.c svneol=native#text/plain
+utils/sim_pasc/sim.h svneol=native#text/plain
+utils/sim_pasc/sim.html svneol=native#text/plain
+utils/sim_pasc/sim.txt svneol=native#text/plain
+utils/sim_pasc/sortlist.bdy svneol=native#text/plain
+utils/sim_pasc/sortlist.spc svneol=native#text/plain
+utils/sim_pasc/stream.c svneol=native#text/plain
+utils/sim_pasc/stream.h svneol=native#text/plain
+utils/sim_pasc/sysidf.mk svneol=native#text/plain
+utils/sim_pasc/sysidf.msdos svneol=native#text/plain
+utils/sim_pasc/sysidf.unix svneol=native#text/plain
+utils/sim_pasc/system.par svneol=native#text/plain
+utils/sim_pasc/text.c svneol=native#text/plain
+utils/sim_pasc/text.h svneol=native#text/plain
+utils/sim_pasc/textlang.l svneol=native#text/plain
+utils/sim_pasc/token.c svneol=native#text/plain
+utils/sim_pasc/token.h svneol=native#text/plain
+utils/sim_pasc/tokenarray.c svneol=native#text/plain
+utils/sim_pasc/tokenarray.h svneol=native#text/plain
 utils/simulator/Makefile svneol=native#text/plain
 utils/simulator/Makefile svneol=native#text/plain
 utils/simulator/Makefile.fpc svneol=native#text/plain
 utils/simulator/Makefile.fpc svneol=native#text/plain
 utils/simulator/alphasim.pas svneol=native#text/plain
 utils/simulator/alphasim.pas svneol=native#text/plain

+ 57 - 0
utils/sim_pasc/Answers

@@ -0,0 +1,57 @@
+		The software and text similarity tester SIM
+
+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
+Miranda, and natural language.  It is used
+
+- to detect potentially duplicated code fragments in large software projects,
+	in program text but also in shell scripts and documentation;
+- to detect plagiarism in software projects, educational and otherwise.
+
+SIM is available through ftp.  The directory
+
+	ftp.cs.vu.nl:pub/dick/similarity_tester
+
+contains the sources (in C) and the MSDOS .EXEs.
+
+The software similarity tester is very efficient and allows us to compare
+this year's students' work with that collected from many past years (much to
+the dismay of some, mostly non-CS, students).  Students are told in advance
+that their work is going to be compared, but some are non-believers ...
+
+The output of the similarity tester can be processed by a number of shell
+scripts by Matty Huntjens.  These shell scripts take sim output and produce
+lists of suspect submissions, histograms and the like.
+The present version of these scripts is very much geared to the local situation
+at the Vrije Universiteit, though; they are low on portability.
+Matty Huntjens' email address is [email protected].
+
+We are not afraid that students would try to tune their work to the
+similarity tester.  We reckon if they can do that they can also do the
+exercise.
+
+Since this piece of handicraft does not qualify as research, there are no
+international papers on it.  A paper, titled `Detecting copied submissions in
+computer science lab work', was published in a local (i.e. Dutch) computer
+science journal:
+
+%A Dick Grune
+%A Matty Huntjens
+%T Het detecteren van kopie\(:en bij informatica-practica
+%J Informatie (in Dutch)
+%V 31
+%N 11
+%D Nov 1989
+%P 864-867
+
+The ftp directory contains a terse technical report about the internal
+working of the program.
+
+					Dick Grune
+					Vrije Universiteit
+					de Boelelaan 1081
+					1081 HV  Amsterdam
+					the Netherlands
+					[email protected]
+					+31 20 444 7744
+----------------------------------------------------------------
+With infinitely many exceptions, what you do makes no difference.

+ 580 - 0
utils/sim_pasc/ChangeLog

@@ -0,0 +1,580 @@
+2007-08-23  Dick Grune  <[email protected]>
+	LICENSE.txt added.
+
+2006-11-27  Dick Grune  <[email protected]>
+	Removal of setbuff() for compatibility.
+
+2005-01-17  Dick Grune  <[email protected]>
+	Corrections by Jerry James <[email protected]>; ANSIizing, etc.
+
+2004-08-05  Dick Grune  <[email protected]>
+	Finished the 'percentage' option.
+
+08-Nov-2001	Dick Grune
+	Begun to add a 'percentage' option, which will express the
+	similarity between two files in percents.
+
+27-Sep-2001	Dick Grune
+	Split add_run() off from compare.c into add_run.c, to accommodate
+	different add_run()s, for different types of processing.
+
+27-Nov-1998	Dick Grune
+	Installed a Miranda version supplied by Emma Norling ([email protected])
+
+23-Feb-1998	Dick Grune
+	Renamed text.l to textlang.l for uniformity and to make room for
+	a possible module text.[ch].
+
+	Isolated a module for handling the token array from buff.[ch] to
+	tokenarray.[ch], and renamed buff.[ch] to text.[ch].
+
+23-Feb-1998	Dick Grune
+	There is probably not much point in abandoning the nl_buff list
+	when running out of memory for TokenArray[]: each token costs 1
+	byte for the token and 4 bytes for the entry in
+	forward_references[], a total of 5 bytes.  There are about 3
+	tokens to a line, together requiring 15 bytes, plus 1 byte in
+	nl_buff yields 16 bytes.  So releasing nl_buff frees only 1/16 =
+	6.7 % of memory.
+
+	Since the code is a bother, I removed it.  Note that nl_buff is
+	still abandoned when the number of tokens in a line does not fit
+	in one unsigned char (but that is not very likely to happen).
+
+	
+21-Feb-1998	Dick Grune
+	Printing got into an infinite loop when the last line of the
+	input was not terminated by a newline AND contained tokens that
+	were included in a matching run.
+	This was due to a double bug: 1. the non-terminated line was not
+	registered properly in NextTextTokenObtained() / CloseText(),
+	and 2. the loop in pass 2 which sets the values of
+	pos->ps_nl_cnt was terminated prematurely when the file turned
+	out to be shorter than the list of pos-es indicated.
+	Both bugs were corrected, the first by supplying an extra
+	newline in CloseText() when one is found missing, and the second
+	by rewriting the list-parallel loop in pass 2.
+
+02-Feb-1998	Dick Grune
+	Pascal does not differentiate between strings and characters
+	(strings of one character); this difference has been removed
+	from pascallang.l.
+
+22-Jan-1998	Dick Grune
+	Detection of non-ASCII characters added.  Since the lexical
+	analyser itself generates non-ASCII characters, the test must occur
+	earlier.  We could replace the input routine of lex by a
+	checking routine, but with several lex-es going around, we want
+	a more lex-independent solution.  To allow each language its own
+	restrictions about non-ASCII characters, the check is
+	implemented in the *lang.l files.
+
+28-Nov-1997	Dick Grune
+	Changed the name of the C similarity tester 'sim' to 'sim_c', for
+	uniformity with sim_java, etc.
+
+23-Nov-1997	Dick Grune
+	Java version finished; checked by Matty Huntjens and crew.
+
+24-Jun-1997	Dick Grune
+	Started on a Java version, by copying the C version.
+
+22-Jun-1997	Dick Grune
+	Modern lexical analysers, among which flex, read the entire input into
+	a buffer before they issue the first token.  As a result, ftell() no
+	longer gives a usable indication of the position of a token in a file.
+	This pulls the rug from under the nl_buff mechanism in buff.c, which
+	is removed.  We lose a valuable optimization this way, but there just
+	seems to be no way to keep it.
+
+	Note that this has nothing to do with the problem in MS-DOS of
+	character count and fseek position not being synchronized.  That
+	problem has been solved on June 14, 1991 (which see) and the code has
+	been running OK since.
+
+18-Jun-1997	Dick Grune
+	The thought has occurred to use McCreight's linear longest common
+	substring algorithm rather than the existing algorithm, which has a
+	small quadratic component.  There are a couple of problems with this:
+	1.	We need the longest >non-overlapping< common substring;
+		McCreight provides just the longest.  It is not at all clear
+		how to modify the algorithm.
+	2.	Once we have found our LCS, we want to find the
+		one-but-longest; it is far from obvious how to do that in
+		McCreight's algorithm.
+	3.	Once we have found our LCS, we want to take one of its
+		copies out of the game, to suppress duplicate messages.
+		Again, it is difficult to see how to do that, without
+		redoing all the calculations.
+	4.	McCreight's algorithm seems to require about two binary
+		tree nodes per token, say 8 bytes, which is double what
+		we use now.
+
+17-Jun-1997	Dick Grune
+	Did some experimenting with the hash function; it is still
+	pretty bad: the simple-minded second sweep through
+	forward_references easily removes another 80-99% of false hits.
+	Next, a third sweep that does a full comparison will remove another
+	large percentage.
+	
+	So I have left in the second sweep in all cases.
+	
+	There are a couple of questions here:
+	1. Can we find a better hash function, or will we forever need a
+		second sweep?
+	2. Does it actually matter, or will we lose on more expensive
+		hashing what we gain by having a better set of forward
+		references in compare.c?
+
+
+16-Jun-1997	Dick Grune
+	Cleaned up sim.h and renamed aiso.[ch] to runs.[ch] since they
+	are instantiations of the aiso module concerned with runs.
+	Aiso.[spc|bdy] stays aiso.[spc|bdy], of course.
+
+16-Jun-1997	Dick Grune
+	Redid largest_function() in algollike.c.
+	Corrected bug in CheckRun; it now always removes NonFinals from
+	the end, even when it has first applied largest_function().
+
+15-Jun-1997	Dick Grune
+	Reorganized the layers around the input file.  There were and
+	still are three layers: lang, stream and buff.
+
+	Since the lex_X variables are hoisted unchanged through the levels
+	lang, stream, and buff, to be used by pass1, pass2, etc., they
+	have to be placed in a module of their own.
+
+	The token-providing module 'lang' has three interfaces:
+	-	lang.h, which provides access to the lowest-level token
+			routines, to be used by the next level.
+	-	lex.h, which provides the lex variables, to be used by
+			all and sundry.
+	-	language.h, which provides language-specific info about
+			tokens, concerning their suitability as initial
+			and final tokens, to be used by higher levels.
+			
+	This structure is not satisfactory, but it is also unreasonable
+	to combine them in one interface.
+
+	There is no single lang.c; rather it is represented by the
+	various Xlang.c files generated from the Xlang.l files.
+
+14-Jun-1997	Dick Grune
+	Added a Makefile zip entry to parallel the shar entry.
+
+13-Jun-1997	Dick Grune
+	A number of simplifications, in view of better software and bigger
+	machines:
+	-	Removed good_realloc from hash.c; I don't think there are
+		any bad reallocs left.
+	-	Removed the option to run without forward_references.
+		On a 16Mb machine this means you have at least 2M tokens;
+		using a quadratic algorithm will take 4*10^6 sec. at an
+		impossible rate of 1M actions/sec., which is some 50 days.
+		Forget it.
+	-	Renamed lang() to print_stream(), and incorporated it in sim.c
+	-	Removed the MSDOS subdirectory mechanism in the Makefile.
+	-	Removed the funny and sneaky double parameter expansion in
+		the call of idf_in_list().
+
+12-Jun-1997	Dick Grune
+	Converted to ANSI C.  Removed cport.h.
+
+09-Jan-1995	Dick Grune
+	Decided not to do directories: they usually contain extraneous
+	files and doing sim * is simple enough anyway.
+
+09-Sep-1994	Dick Grune
+	Added system.h to cater for the (few) differences between Unix and DOS.
+	The #define int32 is also supplied there.
+
+05-Sep-1994	Dick Grune
+	Added many prototype declarations using cport.h.
+	Added a depend entry to the Makefile.
+
+31-Aug-1994	Dick Grune
+	All these changes require a 32 bit integer; introduced a #define
+	int32, set from the command line in the Makefile.
+
+25-Aug-1994	Dick Grune
+	It turned out that one of the most often called routines was .rem,
+	from idf_hashed() in idf.c.  Moving the % out of the loop chafed off
+	another 6% and reduced the time to 18.4 sec.
+
+19-Aug-1994	Dick Grune
+	With very large files (e.g., concatenated /usr/man/man1/*) the fixed
+	built-in hash table size of 10639 is no longer satisfactory.  Hash.c
+	now finds a prime about 8 times smaller than the text_size to use
+	for hash table size; this achieves optimal speed-up without gobbling
+	up too much memory.  Reduced the time for the above file from 30.2
+	sec. to 19.6 sec.
+	For checking, the same test was run with all hashing off; it took
+	20h 27m 19s = 73639 sec.  But it worked.
+
+11-Aug-1994	Dick Grune
+	For large values of MinRunSize (>1000) a large part of the time
+	(>two-thirds) was spent in calculating the hash values for each
+	position in the input, since the cost of this calculation was
+	proportional to MinRunSize.  We now sample a maximum of 24 tokens
+	from the input string to calculate the hash value, and avoid
+	overflow.  On my workstation, this reduces the time for
+		sim_text -r 1000 -n /usr/man/man1/*
+	from 60 sec to 21 sec.
+
+30-Jun-1992	Dick Grune,kamer R4.40,telef. 5778
+	There was an amazing bug in buff.c where NextTextToken() for pass 2
+	omitted to set lex_token to EOL when retrieving newline info from
+	nl_buff. Worked until now!?!
+
+23-Sep-1991	Dick Grune
+	Cport.h introduced, CONST and *.spc only.
+
+17-Sep-1991	Dick Grune
+	The position-sorting routine in pass2.c has been made into a
+	separate generic module.
+
+14-Jun-1991	Dick Grune ([email protected]) at dick.cs.vu.nl
+	Replaced the determination of the input position through counting
+	input characters by calls of ftell(); this is cleaner and the other
+	method will never work on MSDOS.
+
+30-May-1989	Dick Grune (dick) at dick
+	Replaced the old top-100 module (which had been extended to top-10000
+	already anyway) by the new aiso (arbitrary-in sorted-out) module.
+	This caused a considerable speed-up on the Mod2 test bed:
+		 %time  cumsecs  #call  ms/call  name
+		  17.9    99.20   7209    13.76  _InsertTop
+		   0.3     1.37   7209     0.19  _InsertAiso
+	It turns out that malloc() is not a serious problem, so no special
+	version for the aiso module is required.
+
+23-May-1989	Dick Grune (dick) at dick
+	No more uncommented comment at the end of preprocessor lines, to
+	conform to ANSI C.
+
+23-May-1989	Dick Grune (dick) at dick
+	Added code in the X.l files to (silently) reject characters over 0200.
+	This does not really help, since lex stops on null chars. Ah, well.
+
+19-May-1989	Dick Grune (dick) at dick
+	Made the token as handled by sim into an abstract data type, for
+	aesthetic reasons. Sign extension is still a problem.
+
+03-May-1989	Dick Grune (dick) at dick
+	Optimized lcs() by first checking from the end if a sufficiently long
+	run is present; if in fact only the first 12 tokens match, chances
+	are good that you can reject the run right away by first testing
+	the 20th token, then the 19th, and so on.
+
+21-Apr-1989	Dick Grune (dick) at dick
+	A run of sim_m2 finding 7209 similarities raised the question of
+	the appropriateness of the linear sort in sort_pos(). Profiling
+	showed that in this case sorting takes all of 7.5 % of the total
+	time. Putting the word register in in the right places in
+	sort_pos() lowered this number to 4.6%.
+
+20-Apr-1989	Dick Grune (dick) at dick
+	Moved the test for MayBeStartOfRun() from compare.c (where it is
+	done again and again) to hash.c, where its effect is incorporated in
+	the forward reference chain.
+
+14-Apr-1989	Dick Grune (dick) at dick
+	Replaced elem_of() by bit tables, headers[] and trailers[], to be
+	prefilled from Headers[] and Trailers[] by a call of
+	InitLanguage(). This saves a few percents.
+
+13-Apr-1989	Dick Grune (dick) at dick
+	Implemented the -e and the -S option, by putting yet another loop
+	in compare.c
+
+13-Apr-1989	Dick Grune (dick) at dick
+	The -- option (displaying the tokens) will now handle more than one
+	file.
+
+20-Jan-1989	Dick Grune (dick) at dick
+	After the modification of 19-Dec-88, 12% of the time went into
+	updating the positions in the chunks, as they were produced by the
+	matching process. This matching process identifies runs (matches)
+	by token position, which has to be recalculated to lseek positions
+	and line numbers. To this end the files are read again, and for
+	each line all positions found were checked to see if they applied
+	to this line; this was an awfully stupid algorithm, but since much
+	more time was spent elsewhere, it did not really matter. With all
+	the saving below, however, it had risen to second position, after
+	yylook() with 35%.
+
+	The solution was, to sort the positions in the same order in which
+	they would be met by the reading of the files. The process is then
+	linear. This required some extensive hacking in pass2.c
+
+06-Jan-1989	Dick Grune (dick) at dick
+	The modification below did indeed save 25%. The newline information
+	is now reduced to 2 shorts; 2 chars were not enough, since some
+	lines are longer than 127 bytes, and a char and a short together
+	take as much room as two shorts.
+
+19-Dec-1988	Dick Grune (dick) at dick
+	To avoid reading the files twice (which is still taking 25% of the
+	time), the first pass will now collect newline information for the
+	second pass in a buffer called nl_buff[].  This buffer, and the
+	original token buffer now named TokenArray[], are managed by the file
+	buff.c, which implements a layer between stream.h and pass?.c. This
+	layer provides OpenText(), NextTextToken() and CloseText(), each
+	with a parameter telling which pass it is.
+
+06-Dec-1988	Dick Grune (dick) at dick
+	As an introduction to removing the second pass altogether, the
+	first and second scan were unified, i.e., their input is identical.
+	This also means that the call sim -[12] has now been replaced by
+	one call:  sim --.
+
+23-Sep-1988	Dick Grune (dick) at dick
+	Dynamic allocation of line buffers in pass 3.  This removes the
+	restriction on the page width.
+
+22-Sep-1988	Dick Grune (dick) at dick
+	In order to give better messages on incorrect calls to sim, the
+	whole option handling has been concentrated in a file option.c and
+	separated from the options and their messages themselves. See sim.c
+
+07-Sep-1988	Dick Grune (dick) at dick
+	For long text sequences (say hundreds of thousands of tokens),
+	the hashing is not really efficient any more since too many
+	spurious matches occur.  Therefore, the forward reference table is
+	scanned a second time, eliminating from any chain all references to
+	runs that do not end in the same token.  For the UNIX manuals this
+	reduced the number of matches from 91.9% to 1.9% (of which 0.06%
+	were genuine).
+
+30-Aug-1988	Dick Grune (dick) at dick
+	For compatibility, NextTop has been rewritten to yield true or
+	false and to accept a pointer to a run as a parameter.
+
+30-Aug-1988	Dick Grune (dick) at dick
+	When trying to find line-number and lseek position to beginnings
+	and ends of runs found, the whole set of runs was scanned for each
+	line in each file.  Now only the runs belonging to that file are
+	scanned; to this end another linked list has been braided through
+	the data structures (tx_chunk).
+
+30-Aug-1988	Dick Grune (dick) at dick
+	The longest-common-substring algorithm was called much too often,
+	mainly because the forward references made by hashing suffered from
+	pollution.  If you have say 1000 tokens and a hash range of say
+	10000, about 5 % of the hashings will be false matches, i.e. 50
+	matches, which is quite a lot on a natural number of 2 to 3 matches.
+	Improved by doing a second check in make_forw_ref().
+
+12-Jun-1988	Dick Grune (dick) at dick
+	Installed a Lisp version supplied by Gertjan Akkerman.
+
+15-Jan-1988	Dick Grune (dick) at dick
+	Added register declarations all over the place.
+
+14-Jan-1988	Dick Grune (dick) at dick
+	It is often useful to match a piece of code exactly, especially
+	when function names (or, even more so, macro names) are involved.
+	What one would want is having all the letters in the text array,
+	but this is kind of hard, since each entry is one lexical item.
+	This means that under the -F option each letter is a lex item, and
+	normally each tag is a lex item; this requires two lex grammars in
+	one program; no good.  So, on the -F flag we hash the identifier
+	into one lex item, which is hopefully characteristic enough.  It
+	works.
+
+30-Sep-1987	Dick Grune (dick) at dick
+	Some cosmetics.
+
+31-Aug-1987	Dick Grune (dick) at dick
+	Moved the whole thing to the SUN (while testing on a VAX and a
+	MC68000)
+
+16-Aug-1987	Dick Grune (dick) at dick
+	The test program lang.c is no longer a main program, but rather a
+	subroutine called in main() in sim.c, through the command line
+	option -1 or -2.
+
+23-Apr-1987	Dick Grune (dick) at tjalk
+	Changed the name 'index' into 'elem_of', because of compatibility
+	problems on different Unices. Added a declaration for it in
+	the file algollike.c
+
+10-Mar-1987	Dick Grune (dick) at tjalk
+	Changed the printing of the header of a run so that:
+	-	long file names will no longer be truncated
+	-	the run length is displayed
+
+27-Jan-1987	Dick Grune (dick) at tjalk
+	Switched it right off again!  Getting them in textual order is
+	still more unpleasant, since now you cannot find the important
+	ones if there are more than a few runs.
+
+27-Jan-1987	Dick Grune (dick) at tjalk
+	Going to experiment with leaving out the sorting; just all the
+	runs, in the order we meet them.  Should be as good or better.
+	Comparisons of more than 100 runs are very rare anyway, so the
+	fact that those over a 100 are rejected is probably no great
+	help.  Getting them in a funny order is a nuisance, however.  Down
+	with featurism.  Just to be safe, present version saved as
+	870127.SV
+
+26-Dec-1986	Dick Grune (dick) at tjalk
+	Names of overall parameters in params.h changed to more uniformity.
+
+26-Dec-1986	Dick Grune (dick) at tjalk
+	Since the top package and the instantiation system have grown
+	apart so much, I have integrated the old top package into sim,
+	i.e., done the instantiation by hand.  This removes top.g and
+	top.p, and will save outsiders from wondering what is going on
+	here.
+
+23-Dec-1986	Dick Grune (dick) at tjalk
+	Use setbuf to print unbuffered while reading the files (lex core
+	dumps, other mishaps) and print buffered while printing the real
+	output (for speed).
+
+30-Nov-1986	Dick Grune (dick) at tjalk
+	Various small changes in *lang.l:
+		; ignored conditionally (!options['f'])
+		new format for tokens in struct idf
+		cosmetics: macro Layout, macro UnsafeComChar, no \n
+			in character denotations, more than one char
+			in a char denotations in Pascal, etc.
+
+30-Nov-1986	Dick Grune (dick) at tjalk
+	Added a Modula-2 version.
+
+29-Nov-1986	Dick Grune (dick) at tjalk
+	Restricting tokens to the ASCII95 character set is really too
+	severe: some languages have many more reserved words (COBOL!).
+	Corrected this by adding a couple of '&0377' in strategic places.
+	Added a routine for printing the 8-bit beasties: show_token().
+
+15-Aug-1986	Dick Grune (dick) at tjalk
+	Since the ; is superfluous in both C and Pascal, it is now ignored
+	by clang.l and pascallang.l
+
+15-Aug-1986	Dick Grune (dick) at tjalk
+	The code in CheckRun in Xlang.l was incorrect in that it used the
+	wrong criterion for throwing away trailing garbage. I've taken
+	CheckRun etc. out of the Xlang.l-s and turned them into a module
+	"algollike.c".  Made a cleaner interface and avoided duplication of
+	code.
+
+02-Jul-1986	Dick Grune (dick) at tjalk
+	Looking backwards in compare.c to see if we are in the middle of a
+	run is an atavism. You can be and still be all right, e.g., if
+	part of the run was rejected as not fitting for a function.
+	Removed from compare.c.
+
+10-Jun-1986	Dick Grune (dick) at tjalk
+	The function hash_code() in hash.c could yield a negative value;
+	corrected.
+
+09-Jun-1986	Dick Grune (dick) at tjalk
+	Changed the name of the file text.h to sim.h.  Sim.h is more
+	appropriate and text.h sounds as if it belongs to text.l, with
+	which it has no connection.
+
+04-Jun-1986	Dick Grune (dick) at tjalk
+	After having looked at a couple of hash functions and having done
+	some calculations on the number of duplicates normally encountered
+	in hash functions, I conclude that our function in hash.c is quite
+	good.  Removed all the statistics-gathering stuff.
+	
+	Actually, hash_table[] is not the hash table at all; it is a
+	forward reference table; likewise, the real hash table was called
+	last[].  Renamed both.
+	
+	There is a way to keep the hash table local without putting it on
+	the stack: use malloc().
+
+02-Jun-1986	Dick Grune (dick) at tjalk
+	Added a simple lex file for text: each word is condensed into a
+	hash code which is mapped on the ASCII95 character set.  This
+	turns out to be quite effective.
+
+01-Jun-1986	Dick Grune (dick) at tjalk
+	The macros cput(tk) and c_eol() both have a return in them, so any
+	code after them may not be executed -> they have to be last in an
+	entry.  But they weren't, in many places; I can't imagine why it
+	all worked nevertheless.  They have been renamed return_tk(tk) and
+	return_eol() and the entries have been restructured.
+
+30-May-1986	Dick Grune (dick) at tjalk
+	Moved the string and character entries in clang.l and pascallang.l
+	to a place behind the comment entries, to avoid strings (and
+	characters) being recognized inside comments.  I first thought
+	this would not happen, but as Maarten pointed out, if both
+	interpretations have the same length, lex will take the first
+	entry. Now this will happen if the string occupies the whole line
+	that would otherwise be taken as a comment.  In short,
+	/*
+	"hallo"
+	*/
+	would return ".
+
+28-May-1986	Dick Grune (dick) at tjalk
+	Added -d option, to display the output in diff(1) format (courtesy
+	of Maarten van der Meulen).
+	Rewrote the lexical parsing of comments (likewise courtesy Maarten
+	van der Meulen).
+
+20-May-1986	Dick Grune (dick) at tjalk
+	Added a routine to convert identifiers to lower case in
+	pascallang.l .
+
+19-May-1986	Dick Grune (dick) at tjalk
+	Added -a option, to quickly check antecedent of a file (courtesy
+	of Maarten van der Meulen).
+
+18-May-1986	Dick Grune (dick) at tjalk
+	Brought everything under RCS/CVS.
+
+18-Mar-1986	Dick Grune (dick) at tjalk
+	Added modifications by Paul Bame (hp-lsd!paul@hp-labs) to have an
+	option -w to set the page width.
+
+21-Feb-1986	Dick Grune (dick) at tjalk
+	Took array last[N_HASH] out of make_hash() in hash.c, due to stack
+	overflow on the Gould (reported by George Walker
+	[email protected])
+
+16-Feb-1986	Dick Grune (dick) at tjalk
+	Corrected some subtractions that caused unsigned ints to turn
+	pseudo-negative. (Reported by jaap@mcvax)
+
+11-Jan-1986	Dick Grune (dick) at tjalk
+	Touched up for distribution.
+
+10-Jan-1986	Dick Grune (dick) at tjalk
+	Fill_line was not called for empty lines, which caused them to be
+	printed as repetitions of the previous line.
+
+24-Dec-1985	Dick Grune (dick) at tjalk
+	Reduced hash table to a single array of indices; it is used only
+	in one place, which makes it very easy to make it (the hash table)
+	optional.  General tune-up of everything.  This seems to be
+	another stable "final" version.
+
+14-Dec-1985	Dick Grune (dick) at tjalk
+	Some experiments with hash formulas:
+	h = (h OP CST) + *p++ OP CST yields	right	wrong
+		* 96		- 32		205	562
+		* 96		- 2		205	560
+		* 96				205	560
+		* 97				205	559
+		<< 0				 66	3128
+		<< 1				203	555
+		<< 2				205	536
+		<< 7				203	540
+	Conclusion: it doesn't matter, unless you do it wrong.
+
+01-Oct-1983	Dick Grune (dick) at vu44
+	Oldest known files.
+
+#	This file is part of the software similarity tester SIM.
+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+#	$Id: ChangeLog,v 2.12 2007/08/27 09:57:30 dick Exp $
+#

+ 31 - 0
utils/sim_pasc/LICENSE.txt

@@ -0,0 +1,31 @@
+Copyright (c) 1986, 2007, Dick Grune, Vrije Universiteit, The Netherlands
+All rights reserved.
+
+Redistribution and use in source and binary forms,
+with or without modification, are permitted provided
+that the following conditions are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the Vrije Universiteit nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 566 - 0
utils/sim_pasc/Makefile

@@ -0,0 +1,566 @@
+#	This file is part of the software similarity tester SIM.
+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+#	$Id: Makefile,v 2.17 2007/08/27 09:57:31 dick Exp $
+#
+
+
+#	E N T R Y   P O I N T S
+test_sim:
+
+help:
+	@echo 'Entry points:'
+	@echo 'test_sim:	compile sim_c and run a simple test (default)'
+	@echo ''
+	@echo 'all:		create all binaries'
+	@echo 'sim_X(.exe):	create specific binary for language X'
+	@echo 'install_all:	install all binaries'
+	@echo 'install.sim_X:	install specific binary for language X'
+	@echo 'where X is one of c, java, pasc, m2, lisp, mira, text'
+	@echo ''
+	@echo 'lint:		lint sim_c sources'
+	@echo 'lint.all:	lint all sim sources'
+	@echo 'simsim:		run sim on the sim sources'
+	@echo ''
+	@echo 'simsrc.shr:	create sources shar file'
+	@echo 'simsrc.zip:	create sources zip file'
+	@echo 'depend:		update dependencies in Makefile'
+	@echo 'clean:		remove created files'
+	@echo ''
+	@echo 'sim_exes:	create DOS executables in MSDOS; set date; make clean'
+	@echo 'simexe.zip:	create DOS executables package in UNIX'
+	@echo 'VERSION=2.X install_ftp:	install in the FTP directory in UNIX'
+
+VERSION =	2_21
+
+#
+# When you modify any of the following flags, do 'make clean'
+#
+
+include		sysidf.mk
+
+
+# Flags
+OPTLEVEL =	-O4#			#
+
+CFLAGS =	$(SYSTEM) $(OPTLEVEL) $(TESTTOKEN)
+LFLAGS =	#			# loader flags
+
+TESTTOKEN =	#-DTESTTOKEN#		# define to test the token type
+
+
+#	T E S T   P A R A M E T E R S
+
+# Parameters for two simple test runs, sim.res and stream.res:
+TEST_LANG =	c#			# to test sim_X for language X
+TEST_OPT =	-f -r 20#		# options to sim_X
+TEST_INP =	pass3.c#		# guinea pig input
+
+TEST_OPT =	-p#			# options to sim_X
+TEST_INP =	*.l#			# guinea pig input
+TEST_INP =	simple*#		# guinea pig input
+
+
+#	I N T R O D U C T I O N
+
+#	Each module (set of programs that together perform some function)
+#	has the following sets of files defined for it:
+#		_FLS	all files of that module, for, e.g.,
+#			sharring, inventory, etc.
+#		_SRC	the source files, from which other files derive
+#		_CFS	the C-files, from which the object files derive
+#		_OBJ	object files
+#		_GRB	garbage files produced by compiling the module
+#
+#	(This is a feeble attempt at software-engineering a Makefile.)
+#
+
+test_sim:	sim.res stream.res	# two simple tests
+
+
+#	B I N A R I E S
+
+BINARIES =	sim_c$(EXE) sim_java$(EXE) sim_pasc$(EXE) \
+		sim_m2$(EXE) sim_lisp$(EXE) sim_mira$(EXE) \
+		sim_text$(EXE)
+
+all:		$(BINARIES)
+
+
+#	C O M P I L A T I O N   R U L E S
+
+.SUFFIXES:	.o
+.c.o:
+		$(CC) -c $(CFLAGS) $<
+
+
+#	A U X I L I A R Y   M O D U L E S
+
+# Common modules:
+COM_CFS =	token.c lex.c stream.c text.c tokenarray.c error.c
+COM_OBJ =	token.o lex.o stream.o text.o tokenarray.o error.o
+COM_SRC =	token.h lex.h stream.h text.h tokenarray.h error.h \
+		lang.h language.h \
+		sortlist.spc sortlist.bdy system.par $(COM_CFS)
+COM_FLS =	$(COM_SRC)
+
+# The idf module:
+IDF_CFS =	idf.c
+IDF_OBJ =	idf.o
+IDF_SRC =	idf.h $(IDF_CFS)
+IDF_FLS =	$(IDF_SRC)
+
+# The runs package:
+RUNS_CFS =	runs.c percentages.c
+RUNS_OBJ =	runs.o percentages.o
+RUNS_SRC =	runs.h percentages.h $(RUNS_CFS)
+RUNS_FLS =	$(RUNS_SRC) aiso.spc aiso.bdy
+
+# The main program:
+MAIN_CFS =	sim.c options.c pass1.c hash.c compare.c add_run.c \
+		pass2.c pass3.c
+MAIN_OBJ =	sim.o options.o pass1.o hash.o compare.o add_run.o \
+		pass2.o pass3.o
+MAIN_SRC =	sim.h options.h pass1.h hash.h compare.h add_run.h \
+		pass2.h pass3.h \
+		debug.par settings.par $(MAIN_CFS)
+MAIN_FLS =	$(MAIN_SRC)
+
+# The similarity tester without the language part:
+SIM_CFS =	$(COM_CFS) $(IDF_CFS) $(RUNS_CFS) $(MAIN_CFS)
+SIM_OBJ =	$(COM_OBJ) $(IDF_OBJ) $(RUNS_OBJ) $(MAIN_OBJ)
+SIM_SRC =	$(COM_SRC) $(IDF_SRC) $(RUNS_SRC) $(MAIN_SRC)
+SIM_FLS =	$(COM_FLS) $(IDF_FLS) $(RUNS_FLS) $(MAIN_FLS)
+
+
+#	L A N G U A G E S
+
+# The algollike module:
+ALG_CFS =	algollike.c
+ALG_OBJ =	algollike.o
+ALG_SRC =	algollike.h $(ALG_CFS)
+ALG_FLS =	$(ALG_SRC)
+
+# The C Language module:					C
+CLANG_CFS =	clang.c
+CLANG_OBJ =	clang.o
+CLANG_SRC =	clang.l
+CLANG_FLS =	$(CLANG_SRC)
+
+clang.c:	clang.l
+		$(LEX) -t clang.l >$@
+
+SIM_C_CFS =	$(SIM_CFS) $(ALG_CFS) $(CLANG_CFS)
+SIM_C_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(CLANG_OBJ)
+
+sim_c$(EXE):	$(SIM_C_OBJ)
+		$(CC) $(LFLAGS) $(SIM_C_OBJ) -o $@
+
+SIM_C_GRB =	clang.c sim_c
+
+install.sim_c:	$(BINDIR)/sim_c$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_c$(EXE):	sim_c$(EXE)
+		$(COPY) sim_c$(EXE) $@
+
+# The Java Language module:					Java
+JAVALANG_CFS =	javalang.c
+JAVALANG_OBJ =	javalang.o
+JAVALANG_SRC =	javalang.l
+JAVALANG_FLS =	$(JAVALANG_SRC)
+
+javalang.c:	javalang.l
+		$(LEX) -t javalang.l >$@
+
+SIM_JAVA_CFS =	$(SIM_CFS) $(ALG_CFS) $(JAVALANG_CFS)
+SIM_JAVA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(JAVALANG_OBJ)
+
+sim_java$(EXE):	$(SIM_JAVA_OBJ)
+		$(CC) $(LFLAGS) $(SIM_JAVA_OBJ) -o $@
+
+SIM_JAVA_GRB =	javalang.c sim_java
+
+install.sim_java:	$(BINDIR)/sim_java$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_java$(EXE):	sim_java$(EXE)
+		$(COPY) sim_java$(EXE) $@
+
+# The Pascal Language module:					Pascal
+PASCLANG_CFS =	pascallang.c
+PASCLANG_OBJ =	pascallang.o
+PASCLANG_SRC =	pascallang.l
+PASCLANG_FLS =	$(PASCLANG_SRC)
+
+pascallang.c:	pascallang.l
+		$(LEX) -t pascallang.l >pascallang.c
+
+SIM_PASC_CFS =	$(SIM_CFS) $(ALG_CFS) $(PASCLANG_CFS)
+SIM_PASC_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(PASCLANG_OBJ)
+
+sim_pasc$(EXE):	$(SIM_PASC_OBJ)
+		$(CC) $(LFLAGS) $(SIM_PASC_OBJ) -o $@
+
+SIM_PASC_GRB =	pascallang.c sim_pasc
+
+install.sim_pasc:	$(BINDIR)/sim_pasc$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_pasc$(EXE):	sim_pasc$(EXE)
+		$(COPY) sim_pasc$(EXE) $@
+
+# The Modula-2 Language module:					Modula-2
+M2LANG_CFS =	m2lang.c
+M2LANG_OBJ =	m2lang.o
+M2LANG_SRC =	m2lang.l
+M2LANG_FLS =	$(M2LANG_SRC)
+
+m2lang.c:	m2lang.l
+		$(LEX) -t m2lang.l >$@
+
+SIM_M2_CFS =	$(SIM_CFS) $(ALG_CFS) $(M2LANG_CFS)
+SIM_M2_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(M2LANG_OBJ)
+
+sim_m2$(EXE):	$(SIM_M2_OBJ)
+		$(CC) $(LFLAGS) $(SIM_M2_OBJ) -o $@
+
+SIM_M2_GRB =	m2lang.c sim_m2
+
+install.sim_m2:	$(BINDIR)/sim_m2$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_m2$(EXE):	sim_m2$(EXE)
+		$(COPY) sim_m2$(EXE) $@
+
+# The Lisp Language module:					Lisp
+LISPLANG_CFS =	lisplang.c
+LISPLANG_OBJ =	lisplang.o
+LISPLANG_SRC =	lisplang.l
+LISPLANG_FLS =	$(LISPLANG_SRC)
+
+lisplang.c:	lisplang.l
+		$(LEX) -t lisplang.l >$@
+
+SIM_LISP_CFS =	$(SIM_CFS) $(ALG_CFS) $(LISPLANG_CFS)
+SIM_LISP_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(LISPLANG_OBJ)
+
+sim_lisp$(EXE):	$(SIM_LISP_OBJ)
+		$(CC) $(LFLAGS) $(SIM_LISP_OBJ) -o $@
+
+SIM_LISP_GRB =	lisplang.c sim_lisp
+
+install.sim_lisp:	$(BINDIR)/sim_lisp$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_lisp$(EXE):	sim_lisp$(EXE)
+		$(COPY) sim_lisp$(EXE) $@
+
+# The Miranda Language module:					Miranda
+MIRALANG_CFS =	miralang.c
+MIRALANG_OBJ =	miralang.o
+MIRALANG_SRC =	miralang.l
+MIRALANG_FLS =	$(MIRALANG_SRC)
+
+miralang.c:	miralang.l
+		$(LEX) -t miralang.l >$@
+
+SIM_MIRA_CFS =	$(SIM_CFS) $(ALG_CFS) $(MIRALANG_CFS)
+SIM_MIRA_OBJ =	$(SIM_OBJ) $(ALG_OBJ) $(MIRALANG_OBJ)
+
+sim_mira$(EXE):	$(SIM_MIRA_OBJ)
+		$(CC) $(LFLAGS) $(SIM_MIRA_OBJ) -o $@
+
+SIM_MIRA_GRB =	miralang.c sim_mira
+
+install.sim_mira:	$(BINDIR)/sim_mira$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_mira$(EXE):	sim_mira$(EXE)
+		$(COPY) sim_mira$(EXE) $@
+
+# The Text module:						Text
+TEXTLANG_CFS =	textlang.c
+TEXTLANG_OBJ =	textlang.o
+TEXTLANG_SRC =	textlang.l
+TEXTLANG_FLS =	$(TEXTLANG_SRC)
+
+textlang.c:	textlang.l
+		$(LEX) -t textlang.l >$@
+
+SIM_TEXT_CFS =	$(SIM_CFS) $(TEXTLANG_CFS)
+SIM_TEXT_OBJ =	$(SIM_OBJ) $(TEXTLANG_OBJ)
+
+sim_text$(EXE):	$(SIM_TEXT_OBJ)
+		$(CC) $(LFLAGS) $(SIM_TEXT_OBJ) -o $@
+
+SIM_TEXT_GRB =	textlang.c sim_text
+
+install.sim_text:	$(BINDIR)/sim_text$(EXE) $(MANDIR)/sim.1
+
+$(BINDIR)/sim_text$(EXE):	sim_text$(EXE)
+		$(COPY) sim_text$(EXE) $@
+
+
+#	T E S T S
+
+# Some simple tests:
+sim.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
+		./sim_$(TEST_LANG)$(EXE) $(TEST_OPT) $(TEST_INP)
+#		./sim_$(TEST_LANG)$(EXE) -x $(TEST_OPT) $(TEST_INP)
+
+stream.res:	sim_$(TEST_LANG)$(EXE) $(TEST_INP)
+		./sim_$(TEST_LANG)$(EXE) -- $(TEST_INP) >stream.res
+		wc stream.res $(TEST_INP)
+
+TEST_GRB =	stream.res
+
+# More simple tests, on the C version only:
+simsim:		sim_c$(EXE) $(SRC)
+		./sim_c$(EXE) -fr 20 $(SRC)
+
+# Lint
+lint:		$(SIM_C_CFS)
+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
+
+lint.all:	$(SIM_C_CFS) $(SIM_JAVA_CFS) $(SIM_PASC_CFS) $(SIM_M2_CFS) \
+		$(SIM_LISP_CFS) $(SIM_MIRA_CFS) $(SIM_TEXT_CFS)
+		$(LINT) $(LINTFLAGS) $(SIM_C_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_JAVA_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_PASC_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_M2_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_LISP_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_MIRA_CFS) | grep -v yy
+		$(LINT) $(LINTFLAGS) $(SIM_TEXT_CFS) | grep -v yy
+
+
+#	O T H E R   E N T R I E S
+
+# Sets of files: general, modules, main programs, languages
+CFS =		$(SIM_CFS) $(ALG_CFS) \
+		$(CLANG_CFS) $(JAVALANG_CFS) $(PASCLANG_CFS) $(M2LANG_CFS) \
+		$(LISPLANG_CFS) $(MIRALANG_CFS) $(TEXTLANG_CFS)
+OBJ =		$(SIM_OBJ) $(ALG_OBJ) \
+		$(CLANG_OBJ) $(JAVALANG_OBJ) $(PASCLANG_OBJ) $(M2LANG_OBJ) \
+		$(LISPLANG_OBJ) $(MIRALANG_OBJ) $(TEXTLANG_OBJ)
+SRC =		$(SIM_SRC) $(ALG_SRC) \
+		$(CLANG_SRC) $(JAVALANG_SRC) $(PASCLANG_SRC) $(M2LANG_SRC) \
+		$(LISPLANG_SRC) $(MIRALANG_SRC) $(TEXTLANG_SRC)
+FLS =		$(SIM_FLS) $(ALG_FLS) \
+		$(CLANG_FLS) $(JAVALANG_FLS) $(PASCLANG_FLS) $(M2LANG_FLS) \
+		$(LISPLANG_FLS) $(MIRALANG_FLS) $(TEXTLANG_FLS) \
+		sysidf.mk sysidf.msdos sysidf.unix
+DOC =		READ_ME READ.ME README.1st sim.1 sim.txt sim.html \
+		ChangeLog Answers TechnReport
+
+ALL_FLS =	Makefile $(FLS) $(DOC)
+
+# Create .EXE archive for MSDOS
+SIM_EXES =	sim_c.exe sim_java.exe sim_pasc.exe sim_m2.exe \
+		sim_lisp.exe sim_mira.exe sim_text.exe
+DOSZIP =	READ.ME sim.txt $(SIM_EXES)
+sim_exes:	$(SIM_EXES)
+
+simexe.zip:	$(DOSZIP)
+		$(ZIP) $@ $(DOSZIP)
+
+DOS_GRB =	simexe.zip
+
+# Install and clean scripts
+install_all:	install			# just a synonym
+install:	install.sim_c install.sim_java install.sim_pasc \
+		install.sim_m2 install.sim_lisp install.sim_mira \
+		install.sim_text
+
+$(MANDIR)/sim.1:	sim.1
+		$(COPY) sim.1 $@
+
+FTPFILES =	README.1st READ_ME LICENSE.txt TechnReport
+
+install_ftp:	$(FTPFILES) simsrc.shr simexe.zip sim.pdf
+		cp -p simsrc.shr sim_$(VERSION).shar
+		cp -p simexe.zip sim_$(VERSION).zip
+		cp -p $(FTPFILES) sim_$(VERSION).shar sim_$(VERSION).zip \
+			README.1st READ.ME READ_ME sim.pdf \
+			$(FTPDIR)/.
+		rm -f sim_$(VERSION).shar sim_$(VERSION).zip
+		ls -l $(FTPDIR)/.
+
+simsrc.shr:	$(ALL_FLS)
+		shar $(ALL_FLS) >$@
+
+simsrc.zip:	$(ALL_FLS)
+		$(ZIP) $@ $(ALL_FLS)
+
+# Plain-text manual: format the man page with nroff and strip the
+# backspace overstrikes it emits for bold/underline.  (The former
+# sed 's/.//g' had lost its backspace and deleted every character.)
+sim.txt:	sim.1
+		nroff -man sim.1 | col -b >$@
+
+sim.pdf:	sim.1
+		troff -man sim.1 | devps | ps2pdf -sPAPERSIZE=a4 - $@
+
+INSTALL_GRB =	simsrc.shr simsrc.zip sim.txt sim.pdf
+
+depend:		$(CFS)
+		makedepend -w 1 -Dlint $(CFS)
+
+.PHONY:		clean fresh
+clean:
+		-rm -f *.o
+		-rm -f $(SIM_C_GRB)
+		-rm -f $(SIM_JAVA_GRB)
+		-rm -f $(SIM_PASC_GRB)
+		-rm -f $(SIM_M2_GRB)
+		-rm -f $(SIM_LISP_GRB)
+		-rm -f $(SIM_MIRA_GRB)
+		-rm -f $(SIM_TEXT_GRB)
+		-rm -f $(TEST_GRB)
+		-rm -f $(INSTALL_GRB)
+		-rm -f a.out a.exe sim.txt core mon.out
+
+fresh:		clean
+		-rm -f $(DOS_GRB)
+		-rm -f *.exe
+
+#	D E P E N D E N C I E S
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
+token.o: token.h
+lex.o: token.h
+lex.o: lex.h
+stream.o: system.par
+stream.o: token.h
+stream.o: lex.h
+stream.o: lang.h
+stream.o: stream.h
+text.o: debug.par
+text.o: sim.h
+text.o: token.h
+text.o: stream.h
+text.o: lex.h
+text.o: options.h
+text.o: error.h
+text.o: text.h
+tokenarray.o: error.h
+tokenarray.o: lex.h
+tokenarray.o: token.h
+tokenarray.o: tokenarray.h
+error.o: sim.h
+error.o: error.h
+idf.o: system.par
+idf.o: token.h
+idf.o: idf.h
+runs.o: sim.h
+runs.o: runs.h
+runs.o: aiso.spc
+runs.o: aiso.bdy
+percentages.o: sim.h
+percentages.o: runs.h
+percentages.o: aiso.spc
+percentages.o: error.h
+percentages.o: percentages.h
+percentages.o: sortlist.bdy
+sim.o: settings.par
+sim.o: sim.h
+sim.o: options.h
+sim.o: language.h
+sim.o: token.h
+sim.o: error.h
+sim.o: hash.h
+sim.o: compare.h
+sim.o: pass1.h
+sim.o: pass2.h
+sim.o: pass3.h
+sim.o: stream.h
+sim.o: lex.h
+options.o: options.h
+pass1.o: debug.par
+pass1.o: sim.h
+pass1.o: text.h
+pass1.o: tokenarray.h
+pass1.o: token.h
+pass1.o: lex.h
+pass1.o: error.h
+pass1.o: pass1.h
+hash.o: system.par
+hash.o: debug.par
+hash.o: sim.h
+hash.o: error.h
+hash.o: language.h
+hash.o: token.h
+hash.o: tokenarray.h
+hash.o: options.h
+hash.o: hash.h
+compare.o: sim.h
+compare.o: tokenarray.h
+compare.o: token.h
+compare.o: hash.h
+compare.o: language.h
+compare.o: options.h
+compare.o: add_run.h
+compare.o: compare.h
+add_run.o: sim.h
+add_run.o: runs.h
+add_run.o: aiso.spc
+add_run.o: percentages.h
+add_run.o: options.h
+add_run.o: error.h
+add_run.o: add_run.h
+pass2.o: debug.par
+pass2.o: sim.h
+pass2.o: text.h
+pass2.o: lex.h
+pass2.o: token.h
+pass2.o: pass2.h
+pass2.o: sortlist.bdy
+pass3.o: system.par
+pass3.o: debug.par
+pass3.o: sim.h
+pass3.o: runs.h
+pass3.o: aiso.spc
+pass3.o: error.h
+pass3.o: options.h
+pass3.o: pass3.h
+pass3.o: percentages.h
+pass3.o: tokenarray.h
+pass3.o: token.h
+algollike.o: options.h
+algollike.o: token.h
+algollike.o: algollike.h
+algollike.o: language.h
+clang.o: options.h
+clang.o: algollike.h
+clang.o: language.h
+clang.o: token.h
+clang.o: idf.h
+clang.o: lex.h
+clang.o: lang.h
+javalang.o: options.h
+javalang.o: algollike.h
+javalang.o: language.h
+javalang.o: token.h
+javalang.o: idf.h
+javalang.o: lex.h
+javalang.o: lang.h
+pascallang.o: options.h
+pascallang.o: algollike.h
+pascallang.o: language.h
+pascallang.o: token.h
+pascallang.o: idf.h
+pascallang.o: lex.h
+pascallang.o: lang.h
+m2lang.o: options.h
+m2lang.o: algollike.h
+m2lang.o: language.h
+m2lang.o: token.h
+m2lang.o: idf.h
+m2lang.o: lex.h
+m2lang.o: lang.h
+lisplang.o: language.h
+lisplang.o: token.h
+lisplang.o: lex.h
+lisplang.o: lang.h
+lisplang.o: idf.h
+miralang.o: language.h
+miralang.o: token.h
+miralang.o: lex.h
+miralang.o: lang.h
+miralang.o: idf.h
+textlang.o: language.h
+textlang.o: token.h
+textlang.o: idf.h
+textlang.o: lex.h
+textlang.o: lang.h

+ 34 - 0
utils/sim_pasc/READ.ME

@@ -0,0 +1,34 @@
+#	This file is part of the software similarity tester SIM.
+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+#	$Id: READ.ME,v 2.8 2005/02/20 17:02:58 dick Exp $
+
+These programs test for similar (or equal) stretches in one or more program
+files and can be used to detect common code or plagiarism. See SIM.TXT.
+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
+natural text.
+
+This READ.ME file describes the MSDOS version. The UNIX version is described
+in the file READ_ME.
+
+The archive SIM_2_21.ZIP contains:
+	READ.ME			this READ.ME file
+	SIM.TXT			a 2-page manual, UNIX-style
+	SIM_C.EXE		similarity tester for C
+	SIM_JAVA.EXE		similarity tester for Java
+	SIM_PASC.EXE		similarity tester for Pascal
+	SIM_M2.EXE		similarity tester for Modula-2
+	SIM_LISP.EXE		similarity tester for Lisp
+	SIM_MIRA.EXE		similarity tester for Miranda
+	SIM_TEXT.EXE		similarity tester for text
+
+The MSDOS version does not contain sources. The sources are available from
+the UNIX archive sim_2_21.shar, but require a C compiler, flex and make.
+
+					Dick Grune
+					Vrije Universiteit
+					de Boelelaan 1081
+					1081 HV  Amsterdam
+					the Netherlands
+					email: [email protected]
+					ftp://ftp.cs.vu.nl/pub/dick
+					http://www.cs.vu.nl/~dick

+ 68 - 0
utils/sim_pasc/README.1st

@@ -0,0 +1,68 @@
+This is SIM, Software and text similarity tester, most recent revision
+                                                               (2.19, 20050220)
+by Dick Grune, Vrije Universiteit, Amsterdam, the Netherlands ([email protected]).
+
+SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp,
+Miranda and natural language. It can be used
+
+- to detect potentially duplicated code fragments in large software projects,
+- to detect plagiarism in software and text-based projects, educational and
+  otherwise.
+
+The program is fast:
+the UNIX version on a Sun ULTRA does about 50000 tokens/sec,
+the DOS version on a Pentium 166 does about 25000 tokens/sec.
+
+SIM is available for UNIX (in source code) and MSDOS (32-bit executables).
+
+UNIX:
+	To obtain the files, do:
+		sh sim_2_21.shar
+	This unpacks the sources, the Makefile, sim.1 and READ_ME.
+	For installation notes and other info then see READ_ME.
+
+MSDOS:
+	To obtain the files, do:
+		[pk]unzip SIM_2_21.zip
+	This unpacks the executables, SIM.TXT and READ.ME.
+	For other info then see READ.ME.
+
+Changes from Release 2.19:
+	Various changes necessitated by Linux flex being different
+
+Changes from Release 2.16:
+	Various updates and adjustments in the code and the installation
+	procedure.
+
+Changes from Release 2.13:
+	Percentage reporting feature added.
+
+Changes from Release 2.12:
+	Miranda checker added.
+
+Changes from Release 2.9:
+	Java checker added.
+	The C checker 'sim' was renamed to 'sim_c', for uniformity.
+	Converted the sources to ANSI C.
+	All versions now report non-ASCII characters in the input.
+
+Changes from Release 2.8:
+	DOS versions can now compare very large files (>400000 tokens)
+
+Changes from Release 1.21, as posted in comp.sources.unix (1987):
+	Ported to MSDOS
+	Significant speed improvements
+	New options: -e, -S and / , to compare files group-wise
+	New option: -F , to require function names to match exactly
+	Lisp version added
+	Miscellaneous improvements
+
+
+					Dick Grune
+					Vrije Universiteit
+					de Boelelaan 1081
+					1081 HV  Amsterdam
+					the Netherlands
+					email: [email protected]
+					ftp://ftp.cs.vu.nl/pub/dick
+					http://www.cs.vu.nl/~dick

+ 52 - 0
utils/sim_pasc/READ_ME

@@ -0,0 +1,52 @@
+#	This file is part of the software similarity tester SIM.
+#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+#	$Id: READ_ME,v 2.6 2005/02/20 17:02:59 dick Exp $
+
+These programs test for similar (or equal) stretches in one or more program
+files and can be used to detect common code or plagiarism. See sim.1.
+Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and
+natural text.
+
+This READ_ME file describes the UNIX version. The MSDOS version is described
+in the file READ.ME.
+
+To obtain the sources, do
+	sh sim_2_21.shar
+
+To compile and test, just call
+	make
+This will generate one executable called sim_c, the checker for C, and will
+run two small tests to show sample output.
+
+To install, examine sysidf.mk, reset BINDIR and MANDIR to sensible paths,
+and call
+	make install.sim_c			for C
+	make install.sim_java			for Java
+	make install.sim_pasc			for Pascal
+	make install.sim_m2			for Modula-2
+	make install.sim_lisp			for Lisp
+	make install.sim_mira			for Miranda
+	make install.sim_text			for text
+or
+	make install_all			for everything.
+These will also install the manual page.
+
+To change the default run size or the page width, adjust the file settings.par
+and recompile.
+
+To add another language L, write a file Llang.l along the lines of clang.l
+and the other *lang.l files, extend the Makefile and recompile.
+All knowledge about a given language L is located in Llang.l; the rest of
+the programs expect each token to be a single character.
+
+Available at present:
+	clang.l javalang.l pascallang.l m2lang.l lisplang.l miralang.l textlang.l
+
+					Dick Grune
+					Vrije Universiteit
+					de Boelelaan 1081
+					1081 HV  Amsterdam
+					the Netherlands
+					email: [email protected]
+					ftp://ftp.cs.vu.nl/pub/dick
+					http://www.cs.vu.nl/~dick

+ 214 - 0
utils/sim_pasc/TechnReport

@@ -0,0 +1,214 @@
+		CONCISE REPORT ON THE ALGORITHMS IN SIM			970623
+
+
+
+	INTRODUCTION
+
+The general outline of the similarity checker is as follows:
+
+	1. the files are read in (pass 1)
+	2. a forward-reference table is prepared
+	3. the set of interesting runs is determined
+	4. the line numbers of the runs are determined (pass 2)
+	5. the contents of the runs are printed in order (pass 3)
+
+To keep the memory requirements (relatively) small, the exact positions
+of the tokens are not recorded.  This necessitates pass 2.  See, however,
+the pertinent chapter.
+
+
+	READING THE FILES
+
+Each file is tokenized using a lex-generated scanner appropriate for
+the input.  Each token fits in one byte, possibly using all 8 bits.  The
+tokens are stored in the array TokenArray[], which is extended by
+reallocation if it overflows.  See tokenarray.c.
+
+Also, to optimize away pass 2, an attempt is made to remember the token
+positions of all beginnings of lines.  The token-positions at BOL are
+stored in the array nl_buff[], which is also extended by reallocation,
+if needed.  If the attempt fails due to lack of memory, nl_buff[] is
+abandoned, and pass2 will read the files instead.
+
+
+	PREPARING THE FORWARD-REFERENCE TABLE
+
+Text is compared by comparing every substring to all substrings
+to the right of it; this process is in essence quadratic.  However,
+only substrings of length at least 'MinRunSize' are of interest,
+which gives us the possibility to speed up this process by using
+a hash table.
+
+Once the entire text has been read in, a forward-reference table
+forward_references[] is made (see hash.c).
+For every position in the text, we construct an index which gives
+the next position in the text where a run of MinRunSize tokens
+starts that has the same hash code.  If there is no such run, the
+index is 0.
+
+To fill in this array, we use a hash table last_index[], such that
+last_index[i] is the index of the latest token with hash_code i, or 0 if
+there is none.  If at a given position p, we find that the text ahead of
+us has hash code i, last_index[i] tells us which position in
+forward_references[] will have to be updated to p.
+See MakeForwardReferences().
+
+For long text sequences (say hundreds of thousands of tokens), the
+hashing is not really efficient any more since too many spurious matches
+occur.  Therefore, the forward reference table is scanned a second time,
+eliminating from any chain all references to runs that do not start with
+and end in the same token (actually this is a second hash code).
+For the UNIX manuals this reduced the number of matches from 91.9% to 1.9%
+(of which 0.06% was genuine).
+
+	DETERMINING THE SET OF INTERESTING RUNS
+
+The overall structure of the routine Compare() (see compare.c) is:
+
+for all new files
+	for all texts it must be compared to
+		for all positions in the new file
+			for all positions in the text
+				for ever increasing sizes
+					try to match and keep the best
+
+If for a given position in the new file a good run (i.e. one of at least
+minimum length) has been found, the run is registered using a call of
+add_run(), the run is skipped in the new file and searching continues at
+the position after it.  This prevents duplicate reports of runs.
+
+Add_run() allocates a struct run for the run (see sim.h)
+which contains two struct chunks and a quality description.  It fills
+in the two chunks with the pertinent info, one for the first file and
+one for the second (which may be the same, if the run relates two chunks
+in the same file).
+
+The run is then entered into the arbitrary-in-sorted-out store AISO (see
+aiso.spc and aiso.bdy, a genuine generic abstract data type in C!), in
+which it is inserted according to its quality.  Both positions
+(struct position) in both chunks in the run (so four in total) are each
+entered in a linked list starting at the tx_pos field in the struct text
+of the appropriate file.
+
+When this is finished, the forward reference table can be deleted.
+
+So the final results of this phase are visible both through the tx_pos
+fields and through the aiso interface.
+
+
+	DETERMINING THE EXACT POSITION OF EACH RUN (PASS 2)
+
+The purpose of this pass is to find for each chunk, which up to now is
+known by token position only, its starting and ending line number (which
+cannot be easily derived from the token position).
+
+For each file that has a non-zero tx_pos field, i.e. that has some
+interesting chunks, the positions in the tx_pos list are sorted on
+ascending line number (they have been found in essentially arbitrary
+order) by sort_pos() in pass2.c.
+
+Next we scan the pos list and the file in parallel, updating the info in
+a position when we meet it.  A position carries an indication whether it
+is a starting or an ending position, since slightly differing
+calculations have to be done in each case.
+
+Actually, if the nl_buff[] data structure still exists, the file is not
+accessed at all and the data from nl_buff[] is used instead.  This is
+done transparently in buff.c.
+
+
+	PRINTING THE CONTENTS OF THE RUNS (PASS 3)
+
+Since each struct run has now been completely filled in, this is simple;
+the hard work is calculating the page layout.
+Pass3() accesses the aiso store and retrieves from it the runs in
+descending order of importance.  Show_run() opens both files, positions
+them using the line numbers and prints the runs.
+
+================================================================
+	CODE EXCERPT OF THE SOFTWARE SIMILARITY TESTER SIM (980222)
+
+sim:
+	get command line options
+	check the options
+
+	init language, to precompute tables
+
+	pass1, read the files
+		# there is an array TokenArray[] that holds all input tokens
+
+	make forward reference table
+		# there is an array forward_references[], with one entry for
+		#   each token in the input; forward_references[i] gives the
+		#   token number where a token sequence starts with the same
+		#   hash value as the one starting at i
+
+	compare various files to find runs
+	delete forward reference table
+	pass2, find newline positions of found similarities
+	pass3, print the similarities
+
+
+
+pass1, read the files:
+	for each file
+		divide the text into tokens
+		store all tokens except newlines in TokenArray and try to
+			keep a record of the newline positions
+
+
+
+make forward reference table:
+	# there are two independent hash functions, hash1() and hash2().
+	#   hash1(i) gives the hash value of the token sequence starting at i
+	#   likewise for hash2(i)
+
+	set up the forward references using the last_index table:
+		# there is an array last_index[], with one entry for each
+		#   possible hash value; last_index[i] gives the position in
+		#   forward_references[] at which i was most recently
+		#   encountered as a hash value
+		for each file
+			for all positions in file except the last MinRunSize
+				set forward_references[] and update last_index[]
+
+	use hash2() to clean out matches:
+		for all tokens
+			find first token in chain with same hash2 code
+			short-circuit forward reference to it
+
+
+
+compare:
+	for all new files
+		for all texts it must be compared to
+			for all positions in the new file
+				for all positions in the text
+					for ever increasing sizes
+						try to match and keep the best
+	try to match and keep the best:
+		# using forward_references[], we find a list of positions in
+		#   which a matching token sequence will start;
+		#   scanning this list, we measure the maximum length of the
+		#   match and add the longest match to the run collection
+
+
+
+pass2, find positions of found runs:
+	for all files:
+		sort the positions in the runs
+
+		# we scan the pos list and the file in parallel
+		for all positions inside this file
+			if it matches a token position in a run
+				record line number
+
+
+
+pass3, print the similarities:
+	for all runs
+		# a run consists of two chunks
+		open the files that hold the chunks and position them
+		  at the beginning of the chunk
+		display the chunks
+

+ 70 - 0
utils/sim_pasc/add_run.c

@@ -0,0 +1,70 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: add_run.c,v 2.5 2001/11/08 12:30:28 dick Exp $
+*/
+
+#include	<malloc.h>
+
+#include	"sim.h"
+#include	"runs.h"
+#include	"percentages.h"
+#include	"options.h"
+#include	"error.h"
+#include	"add_run.h"
+
+/* Fills a chunk with its text and its first and last token positions. */
+static void set_chunk(
+	struct chunk *,			/* chunk to fill in */
+	struct text *,			/* text the chunk lies in */
+	unsigned int,			/* token offset at which the chunk starts */
+	unsigned int			/* number of tokens in the chunk */
+);
+
+/* Fills one position and links it into the position list of the text. */
+static void set_pos(
+	struct position *,		/* position to fill in */
+	int,				/* stored in ps_type (0: first, 1: last) */
+	struct text *,			/* text whose tx_pos list receives it */
+	unsigned int			/* token offset, stored in ps_tk_cnt */
+);
+
+void
+add_run(struct text *txt0, unsigned int i0,
+	struct text *txt1, unsigned int i1,
+	unsigned int size
+) {
+	/*	Registers a run of `size' tokens common to txt0 and txt1.
+		i0 and i1 are turned into offsets relative to the start
+		of their own text (tx_start) before they are stored.
+		Failure to allocate or to store the run is reported
+		through fatal().
+	*/
+	register struct run *new_run;
+
+	new_run = (struct run *)malloc(sizeof (struct run));
+	if (!new_run) {
+		fatal("out of memory");
+	}
+
+	set_chunk(&new_run->rn_cn0, txt0, i0 - txt0->tx_start, size);
+	set_chunk(&new_run->rn_cn1, txt1, i1 - txt1->tx_start, size);
+	new_run->rn_size = size;
+
+	/* option 'p' routes the run to add_to_percentages(),
+	   otherwise it goes to add_to_runs() */
+	if (option_set('p')) {
+		if (!add_to_percentages(new_run)) {
+			fatal("out of memory");
+		}
+	}
+	else {
+		if (!add_to_runs(new_run)) {
+			fatal("out of memory");
+		}
+	}
+}
+
+static void
+set_chunk(struct chunk *cnk, struct text *txt,
+	  unsigned int start, unsigned int size
+) {
+	/*	Describes in *cnk the stretch of `size' tokens of txt that
+		begins at token offset `start': the text is recorded and
+		the first and last token positions are registered.
+	*/
+	unsigned int last_token = start + size - 1;
+
+	cnk->ch_text = txt;
+	set_pos(&cnk->ch_first, 0, txt, start);		/* type 0: first */
+	set_pos(&cnk->ch_last, 1, txt, last_token);	/* type 1: last */
+}
+
+static void
+set_pos(struct position *pos, int type, struct text *txt, unsigned int start) {
+	/*	Initializes *pos and pushes it onto the front of the
+		position list of txt.
+	*/
+	pos->ps_type = type;
+	pos->ps_tk_cnt = start;
+	pos->ps_nl_cnt = -1;		/* uninitialized */
+
+	/* link in at the head of txt->tx_pos */
+	pos->ps_next = txt->tx_pos;
+	txt->tx_pos = pos;
+}

+ 19 - 0
utils/sim_pasc/add_run.h

@@ -0,0 +1,19 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: add_run.h,v 1.1 2001/09/28 09:03:39 dick Exp $
+*/
+
+/*	Interface between front-end and back-end: all information about
+	runs passes through add_run().  Its parameters are the two chunks,
+	each identified by their struct text and the position of the common
+	segment in TokenArray[], and the number of tokens in the common
+	segment.
+*/
+
+void add_run(
+	struct text *txt0,		/* text of first chunk */
+	unsigned int i0,		/* index in TokenArray[] of its first token */
+	struct text *txt1,		/* text of second chunk */
+	unsigned int i1,		/* index in TokenArray[] of its first token */
+	unsigned int size		/* number of tokens in each chunk */
+);

+ 186 - 0
utils/sim_pasc/aiso.bdy

@@ -0,0 +1,186 @@
+/*
+	Module:	Arbitrary-In Sorted-Out (AISO)
+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
+
+Description:
+	This is the body of a module that builds an arbitrary-in
+	sorted-out data structure, to be used as a heap, a priority queue, etc.
+	See aiso.spc for further info.
+*/
+
+#include	<malloc.h>
+
+static struct aiso_node *root;		/* root of tree */
+#ifdef	AISO_ITERATOR
+static struct aiso_node *list;		/* start of linked list */
+#endif	/* AISO_ITERATOR */
+
+/* the policy */
+/*	aiso_size counts the nodes currently in the tree.
+	access_mark is doubled on every counted node access and set back
+	to 1 after a rotation; a rotation is due as soon as access_mark
+	exceeds aiso_size, i.e. after roughly log2(aiso_size) accesses
+	since the last rotation.
+*/
+static int aiso_size = 0;
+static int access_mark = 1;
+
+#define	add_entry()	(aiso_size++)
+#define	remove_entry()	(aiso_size--)
+#define	reset_access()	(access_mark = 1)
+#define	count_access()	(access_mark <<= 1)
+#define	must_rotate()	(access_mark > aiso_size)
+
+/*	Inserts v in its sorted place in the tree (and, with AISO_ITERATOR,
+	in the linked list).  Returns 1 on success; returns 0 when no node
+	could be allocated, in which case tree and list are left untouched.
+*/
+int
+InsertAiso(AISO_TYPE v) {
+	register struct aiso_node *new_node;
+	/* hook: the link (root or a child field) through which we descend */
+	register struct aiso_node **hook = &root;
+#ifdef	AISO_ITERATOR
+	/* prev: the list link behind which the new node will be threaded */
+	register struct aiso_node **prev = &list;
+#endif	/* AISO_ITERATOR */
+
+	/* allocate first, so a failure cannot leave a half-rotated tree */
+	new_node = (struct aiso_node *)malloc(sizeof (struct aiso_node));
+	if (!new_node) {
+		/* avoid modifying the tree */
+		return 0;
+	}
+
+	/* walk down until hook designates an empty link */
+	while (*hook) {
+		register struct aiso_node *an = *hook;
+
+		count_access();
+		if (AISO_BEFORE(v, an->an_value)) {
+			/* head left */
+			if (!an->an_left || !must_rotate()) {
+				/* standard action */
+				hook = &an->an_left;
+			}
+			else {
+				/* change (l A r) B (C) into (l) A (r B C) */
+				/* hook is not advanced: the next iteration
+				   re-examines the new subtree root
+				*/
+				register struct aiso_node *anl = an->an_left;
+
+				an->an_left = anl->an_right;
+				anl->an_right = an;
+				*hook = anl;
+				reset_access();
+			}
+		}
+		else {
+			/* head right */
+			if (!an->an_right || !must_rotate()) {
+				/* standard action */
+				hook = &an->an_right;
+			}
+			else {
+				/* change (A) B (l C r) into (A B l) C (r) */
+				register struct aiso_node *anr = an->an_right;
+
+				an->an_right = anr->an_left;
+				anr->an_left = an;
+				*hook = anr;
+				reset_access();
+			}
+#ifdef	AISO_ITERATOR
+			/* v is not before an, so in the list the new node
+			   comes somewhere after an
+			*/
+			prev = &an->an_next;
+#endif	/* AISO_ITERATOR */
+		}
+	}
+
+	/* fill the new leaf and attach it at the empty link found */
+	new_node->an_left = 0;
+	new_node->an_right = 0;
+#ifdef	AISO_ITERATOR
+	new_node->an_next = *prev;
+	*prev = new_node;
+#endif	/* AISO_ITERATOR */
+	new_node->an_value = v;
+	*hook = new_node;
+	add_entry();
+	return 1;
+}
+
+#ifdef	AISO_EXTRACTOR
+
+/*	Removes the first (leftmost) value from the tree and stores it in
+	*vp.  Returns 1 on success, 0 if the tree is empty (*vp is then
+	left untouched).
+*/
+int
+ExtractAiso(AISO_TYPE *vp) {
+	register struct aiso_node **hook = &root;
+	register struct aiso_node *an;
+
+	if (!root) return 0;
+
+	/* descend along left links until a node without left child */
+	while ((an = *hook), an->an_left) {
+		/* head left */
+		count_access();
+		if (!must_rotate()) {
+			/* standard action */
+			hook = &an->an_left;
+		}
+		else {
+			/* change (l A r) B (C) into (l) A (r B C) */
+			register struct aiso_node *anl = an->an_left;
+
+			an->an_left = anl->an_right;
+			anl->an_right = an;
+			*hook = anl;
+			reset_access();
+		}
+	}
+	/* found the first */
+	*vp = an->an_value;
+	/* its right subtree, possibly empty, takes its place */
+	*hook = an->an_right;
+#ifdef	AISO_ITERATOR
+	/* the node removed is the list head; advance the head past it */
+	list = an->an_next;
+#endif	/* AISO_ITERATOR */
+	free((char *)an);
+	remove_entry();
+	return 1;
+}
+
+#endif	/* AISO_EXTRACTOR */
+
+#ifdef	AISO_ITERATOR
+
+/*	Positions the iterator *ip at the head of the linked list. */
+void
+OpenIter(AisoIter *ip) {
+	*ip = list;
+}
+
+/*	Delivers the value under the iterator in *vp and steps the iterator
+	to the next list node.  Yields 1 if a value was delivered, 0 if the
+	iterator is exhausted (nothing is written in that case).
+*/
+int
+GetAisoItem(AisoIter *ip, AISO_TYPE *vp) {
+	struct aiso_node *current = *ip;
+
+	if (current != 0) {
+		*vp = current->an_value;
+		*ip = current->an_next;
+		return 1;
+	}
+	return 0;
+}
+
+/*	Closes the iterator by clearing it; no storage is released. */
+void
+CloseIter(AisoIter *ip) {
+	*ip = 0;
+}
+
+#endif	/* AISO_ITERATOR */
+
+#ifdef	AISO_DEBUG
+
+#include	<stdio.h>
+
+/*	Prints the subtree under an sideways: right subtree first, then the
+	node itself indented by five spaces per level and tagged with ch,
+	then the left subtree.
+*/
+static void
+print_inf(int level, char ch, struct aiso_node *an) {
+	int indent = level;
+
+	if (an) {
+		print_inf(level+1, '/', an->an_right);
+		while (indent-- > 0) {
+			printf("     ");
+		}
+		printf("%c", ch);
+		printf(AISO_FORMAT, an->an_value);
+		printf("\n");
+		print_inf(level+1, '\\', an->an_left);
+	}
+}
+
+/*	Prints the whole tree, starting at root with tag '-', followed by
+	a separator line.
+*/
+void
+PrintAisoTree(void)
+{
+	print_inf(0, '-', root);
+	printf("================\n");
+}
+
+#endif	/* AISO_DEBUG */

+ 102 - 0
utils/sim_pasc/aiso.spc

@@ -0,0 +1,102 @@
+/*
+	Module:	Arbitrary-In Sorted-Out (AISO)
+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
+	Version:	Tue Aug 23 12:54:22 1988
+
+Description:
+	This is the specification of a generic module that builds an
+	arbitrary-in sorted-out data structure, to be used as a heap, a
+	priority queue, etc. Elements can be inserted, the first element
+	extracted and the set scanned at any moment.
+
+Instantiation:
+	The module is instantiated as follows.
+	Create a file M.h for some M, which contains at least:
+	-	a definition of AISO_TYPE, the type of the object to be stored
+	-	a possible definition of AISO_EXTRACTOR; see below
+	-	a possible definition of AISO_ITERATOR; see below
+	-	#include	"aiso.spc"
+
+	This file M.h is to be included in all files that use the aiso
+	package.
+
+	Create a file M.c which contains at least:
+	-	#include	"M.h"
+	-	a definition of a routine
+			int AISO_BEFORE(AISO_TYPE v, AISO_TYPE w)
+		which yields non-zero if v is to be sorted before w
+	-	#include	"aiso.bdy"
+
+	This file compiles into the module object.
+
+Specification:
+	The module always supplies:
+	int InsertAiso(AISO_TYPE value)
+		inserts value in its proper place; fails if out of memory
+
+	If AISO_EXTRACTOR is defined, the module will also supply:
+	int ExtractAiso(AISO_TYPE *value)
+		yields the first value in the aiso and removes it;
+		fails if empty
+
+	If AISO_ITERATOR is defined, the module also supplies a type AisoIter
+	which declares an iterator, i.e., a structure that records a position
+	in the ordered set, plus routines for manipulating the iterator, thus
+	enabling the user to scan the ordered set.  The iterator should be
+	declared as:
+		AisoIter iter;
+	and is manipulated by the following commands:
+
+	void OpenIter(AisoIter *iter)
+		opens the iterator for scanning the existing set in order
+
+	int GetAisoItem(AisoIter *iter, AISO_TYPE *value)
+		yields the next value in the iterator; fails if exhausted
+
+	void CloseIter(AisoIter *iter)
+		closes the iterator
+
+	If AISO_DEBUG is defined the module will also supply:
+	void PrintAisoTree(void)
+		prints the AISO tree; requires AISO_FORMAT, to be set to
+		a format suitable to print a value of type AISO_TYPE
+
+Implementation:
+	The AISO implementation is based on a self-adjusting binary tree.
+	Degenerate behaviour of the tree is avoided by shaking the tree
+	every 'ln aiso_size' node accesses.  This guarantees ln aiso_size
+	behaviour in the long run, though it is possible for a single
+	operation to take aiso_size node accesses.
+
+	The iterator is implemented as an additional linear linked list
+	through the tree.  This is simpler than and at least as efficient as
+	clever tree-wiring.
+
+Restrictions:
+	Due to built-in fixed names, there can only be one AISO per program.
+*/
+
+/*	One node of the AISO tree. */
+struct aiso_node {
+	struct aiso_node *an_left;	/* left subtree */
+	struct aiso_node *an_right;	/* right subtree */
+#ifdef	AISO_ITERATOR
+	struct aiso_node *an_next;	/* next node in the iterator's list */
+#endif	/* AISO_ITERATOR */
+	AISO_TYPE an_value;		/* the value stored in this node */
+};
+
+extern int InsertAiso(AISO_TYPE value);
+#ifdef	AISO_EXTRACTOR
+extern int ExtractAiso(AISO_TYPE *value);
+#endif	/* AISO_EXTRACTOR */
+
+#ifdef	AISO_ITERATOR
+typedef	struct aiso_node *AisoIter;
+extern void OpenIter(AisoIter *iter);
+extern int GetAisoItem(AisoIter *iter, AISO_TYPE *value);
+extern void CloseIter(AisoIter *iter);
+#endif	/* AISO_ITERATOR */
+
+#ifdef	AISO_DEBUG
+extern void PrintAisoTree(void);
+#endif	/* AISO_DEBUG */

+ 135 - 0
utils/sim_pasc/algollike.c

@@ -0,0 +1,135 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: algollike.c,v 2.4 2005/02/20 17:02:59 dick Exp $
+*/
+
+/*	This module implements the routines InitLanguage, MayBeStartOfRun
+	and CheckRun for ALGOL-like languages, in which it is meaningful
+	and useful to isolate function bodies.
+
+	It requires the user to define, preferably in Xlang.l, four token
+	sets, represented as TOKEN[] and terminated by NOTOKEN:
+
+	TOKEN NonFinals[]	tokens that may not end a chunk
+	TOKEN NonInitials[]	tokens that may not start a chunk
+	TOKEN Openers[]		openers of parentheses that must balance
+					in functions
+	TOKEN Closers[]		the corresponding closers, in the same order
+*/
+
+#include	"options.h"
+#include	"token.h"
+#include	"algollike.h"
+
+/*	Arrays for fast identification tests for tokens.  Each token is
+	identified by its position in the set + 1.  For example, if T is
+	the n-th Opener, openers[TOKEN2int(tk)] == n+1.
+*/
+static char non_finals[256];
+static char non_initials[256];
+static char openers[256];
+static char closers[256];
+
+static void cvt2bittable(const TOKEN *tl, char bt[256]);
+static unsigned int largest_function(const TOKEN *str, unsigned int size);
+
+/*	Fills the four lookup tables from the token sets supplied by the
+	language front end (NonFinals, NonInitials, Openers, Closers).
+*/
+void
+InitLanguage(void) {
+	/* convert the token sets to bitmaps */
+	cvt2bittable(NonFinals, non_finals);
+	cvt2bittable(NonInitials, non_initials);
+	cvt2bittable(Openers, openers);
+	cvt2bittable(Closers, closers);
+}
+
+/*	Enters each token of the NOTOKEN-terminated list tl in the table
+	bt, storing at the token's index its 1-based rank in the list.
+	Entries for tokens not in the list are not touched.
+*/
+static void
+cvt2bittable(const TOKEN *tl, char bt[256]) {
+	const TOKEN *tp;
+	int rank = 0;
+
+	for (tp = tl; !TOKEN_EQ(*tp, NOTOKEN); tp++) {
+		rank++;
+		bt[TOKEN2int(*tp)] = rank;
+	}
+}
+
+/*	Yields non-zero iff tk is not one of the NonInitials, i.e. iff a
+	run is allowed to begin with tk.
+*/
+int
+MayBeStartOfRun(TOKEN tk) {
+	return non_initials[TOKEN2int(tk)] == 0;
+}
+
+unsigned int
+CheckRun(const TOKEN *str, unsigned int size) {
+	/*	Judges whether the run of size tokens at str is acceptable
+		in the language.  The run is only ever shortened from its
+		end; the accepted length is returned and may be zero.
+	*/
+	unsigned int accepted = size;
+
+	/* with option 'f', first cut back to a function-like form */
+	if (option_set('f')) {
+		accepted = largest_function(str, accepted);
+	}
+
+	/* then strip trailing tokens that may not end a chunk */
+	for (;;) {
+		if (accepted == 0) break;
+		if (!non_finals[TOKEN2int(str[accepted-1])]) break;
+		accepted--;
+	}
+
+	return accepted;
+}
+
+static unsigned int
+largest_function(const TOKEN *str, unsigned int size) {
+	/*	Returns the size of the longest sequence starting at
+		str[0] and not containing unbalanced parentheses.
+		Does not check the nesting of the parentheses, but then,
+		sim is syntax-free anyway.
+
+		balance_count[k] holds the number of currently unclosed
+		parentheses of the k-th kind (k as stored in openers[] and
+		closers[]); n_imbalances counts the kinds for which that
+		number is non-zero.  The scan stops at the first Closer
+		that has no matching Opener inside the run.
+	*/
+	unsigned int mrb_size = 0;	/* most recent balancing size */
+	unsigned int pos;
+	/* clear administration; the initializer zeroes all 256 entries */
+	int balance_count[256] = {0};
+	int n_imbalances = 0;
+
+	/* scan str[] and see how far we get */
+	for (pos = 0; pos < size; pos++) {
+		int tkval = TOKEN2int(str[pos]);
+		int pp;				/* parenthesis position */
+
+		/* account for openers */
+		if ((pp = openers[tkval])) {
+			if (balance_count[pp] == 0) {
+				/* about to create an imbalance */
+				n_imbalances++;
+			}
+			balance_count[pp]++;
+		}
+
+		/* account for closers */
+		if ((pp = closers[tkval])) {
+			if (balance_count[pp] == 0) {
+				/* this is one Closer too many */
+				return mrb_size;
+			}
+			balance_count[pp]--;
+			if (balance_count[pp] == 0) {
+				/* we just cleared an imbalance */
+				n_imbalances--;
+			}
+		}
+
+		if (n_imbalances == 0) {
+			/* register balance point */
+			mrb_size = pos + 1;
+		}
+	}
+	return mrb_size;
+}

+ 27 - 0
utils/sim_pasc/algollike.h

@@ -0,0 +1,27 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: algollike.h,v 1.1 1997/06/20 12:03:11 dick Exp $
+*/
+
+/*	The class Algollike is a subclass of Language.  It implements
+	the routines InitLanguage, MayBeStartOfRun and CheckRun for
+	ALGOL-like languages, in which it is meaningful and useful to
+	isolate function bodies.
+
+	It requires the user to define, preferably in Xlang.l, four token
+	sets, represented as TOKEN[] and terminated by NOTOKEN:
+
+	TOKEN NonFinals[]	tokens that may not end a chunk
+	TOKEN NonInitials[]	tokens that may not start a chunk
+	TOKEN Openers[]		openers of parentheses that must balance
+					in functions
+	TOKEN Closers[]		the corresponding closers, in the same order
+*/
+
+#include	"language.h"
+#include	"token.h"
+
+extern const TOKEN NonFinals[];
+extern const TOKEN NonInitials[];
+extern const TOKEN Openers[];
+extern const TOKEN Closers[];

+ 252 - 0
utils/sim_pasc/clang.l

@@ -0,0 +1,252 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: clang.l,v 2.9 2007/08/29 09:10:31 dick Exp $
+*/
+
+/*
+	C language front end for the similarity tester.
+	Author:	Dick Grune <[email protected]>
+*/
+
+#include	"options.h"
+#include	"algollike.h"
+#include	"token.h"
+#include	"idf.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+
+/* Data for module idf */
+
+static const struct idf ppcmd[] = {
+	{"define",	META('d')},
+	{"else",	META('e')},
+	{"endif",	META('E')},
+	{"if",		META('i')},
+	{"ifdef",	META('I')},
+	{"ifndef",	META('x')},
+	{"include",	MTCT('I')},
+	{"line",	META('l')},
+	{"undef",	META('u')}
+};
+
+static const struct idf reserved[] = {
+	{"auto",	NORM('a')},
+	{"break",	NORM('b')},
+	{"case",	NORM('c')},
+	{"char",	NORM('C')},
+	{"continue",	CTRL('C')},
+	{"default",	NORM('d')},
+	{"do",		NORM('D')},
+	{"double",	CTRL('D')},
+	{"else",	NORM('e')},
+	{"enum",	NORM('E')},
+	{"extern",	CTRL('E')},
+	{"float",	NORM('f')},
+	{"for",		NORM('F')},
+	{"goto",	NORM('g')},
+	{"if",		NORM('i')},
+	{"int",		NORM('I')},
+	{"long",	NORM('l')},
+	{"register",	SKIP},
+	{"return",	NORM('r')},
+	{"short",	NORM('s')},
+	{"sizeof",	NORM('S')},
+	{"static",	CTRL('S')},
+	{"struct",	META('s')},
+	{"switch",	META('S')},
+	{"typedef",	NORM('t')},
+	{"union",	NORM('u')},
+	{"unsigned",	NORM('U')},
+	{"void",	SKIP},
+	{"while",	NORM('w')}
+};
+
+/* Special treatment of identifiers */
+
+/*	Translates the identifier in yytext: a reserved word yields its own
+	token; any other identifier yields IDF, or, when hashing is
+	non-zero, the one-token hash code of its spelling.
+*/
+static TOKEN
+idf2token(int hashing) {
+	TOKEN tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
+
+	if (!hashing) return tk;
+	if (!TOKEN_EQ(tk, IDF)) return tk;
+
+	/* a plain identifier and hashing requested */
+	return idf_hashed(yytext);
+}
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	IDF,		/* identifier */
+	NORM('{'),
+	NORM('('),
+	NORM('a'),	/* auto */
+	NORM('b'),	/* break */
+	NORM('c'),	/* case */
+	NORM('C'),	/* char */
+	CTRL('C'),	/* continue */
+	NORM('d'),	/* default */
+	NORM('D'),	/* do */
+	CTRL('D'),	/* double */
+	NORM('E'),	/* enum */
+	CTRL('E'),	/* extern */
+	NORM('f'),	/* float */
+	NORM('F'),	/* for */
+	NORM('g'),	/* goto */
+	NORM('i'),	/* if */
+	NORM('I'),	/* int */
+	NORM('l'),	/* long */
+	NORM('r'),	/* return */
+	NORM('s'),	/* short */
+	CTRL('S'),	/* static */
+	META('s'),	/* struct */
+	META('S'),	/* switch */
+	NORM('t'),	/* typedef */
+	NORM('u'),	/* union */
+	NORM('U'),	/* unsigned */
+	NORM('w'),	/* while */
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM(')'),
+	NORM('}'),
+	NORM(';'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('{'),
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM('}'),
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+AnyQuoted	(\\.)
+StrChar		([^"\n\\]|{AnyQuoted})
+ChrChar		([^'\n\\]|{AnyQuoted})
+
+StartComment	("/*")
+EndComment	("*/")
+SafeComChar	([^*\n])
+UnsafeComChar	("*")
+
+Digit		([0-9a-fA-F])
+Idf		([A-Za-z][A-Za-z0-9_]*)
+
+%%
+
+{StartComment}	{
+		/*	We do not have one single pattern to match a comment
+			(although one can be written), for two reasons.
+			The matched string might overflow lex-internal buffers
+			like yysbuf and yytext; and the pattern would be very
+			complicated and overtax lex.
+			So we break up the string into safe chunks and keep
+			track of where we are in a start condition <Comment>.
+		*/
+		BEGIN Comment;
+	}
+
+<Comment>{SafeComChar}+	{		/* safe comment chunk */
+	}
+
+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
+	}
+
+<Comment>"\n"		{		/* to break up long comments */
+		return_eol();
+	}
+
+<Comment>{EndComment}	{		/* end-of-comment */
+		BEGIN INITIAL;
+	}
+
+\"{StrChar}*\"	{			/* strings */
+		return_ch('"');
+	}
+
+\'{ChrChar}+\'	{			/* characters */
+		return_ch('\'');
+	}
+
+^#{Layout}*include.*	{		/* ignore #include lines */
+	}
+
+^#{Layout}*{Idf}	{		/* a preprocessor line */
+		register char *idf = yytext+1;
+
+		/* skip layout in front of preprocessor identifier */
+		while (*idf == ' ' || *idf == '\t') {
+			idf++;
+		}
+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
+	}
+
+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
+		return_tk(IDF);
+	}
+
+{Idf}/"("	{			/* identifier in front of ( */
+		register TOKEN tk;
+
+		tk = idf2token(option_set('F'));
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+{Idf}	{				/* identifier */
+		register TOKEN tk;
+
+		tk = idf2token(0 /* no hashing */);
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+\;	{				/* semicolon, conditionally ignored */
+		if (option_set('f')) return_ch(yytext[0]);
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+/*	Puts the scanner back in its INITIAL start condition. */
+void
+yystart(void) {
+	BEGIN INITIAL;
+}
+
+/*	End-of-input hook called by the generated scanner; returning 1
+	reports that no further input follows.
+*/
+int
+yywrap(void) {
+	return 1;
+}

+ 198 - 0
utils/sim_pasc/compare.c

@@ -0,0 +1,198 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: compare.c,v 2.5 2001/09/28 09:03:47 dick Exp $
+*/
+
+#include	"sim.h"
+#include	"tokenarray.h"
+#include	"hash.h"
+#include	"language.h"
+#include	"options.h"
+#include	"add_run.h"
+#include	"compare.h"
+
+static void compare1text(int, int, int);
+static unsigned int lcs(
+	struct text *, unsigned int, struct text **, unsigned int *,
+	unsigned int, unsigned int
+);
+
+/*	The overall structure of the routine Compare() is:
+
+	for all new files
+		for all texts it must be compared to
+			for all positions in the new file
+				for all positions in the text
+					for ever increasing sizes
+						try to match and keep the best
+*/
+
+/*	Compares every new text to the texts selected by the options
+	'S', 's' and 'e'.
+*/
+void
+Compare(void) {
+	int new_txt = 0;
+
+	while (new_txt < NumberOfNewTexts) {
+		int start;		/* first text to compare to */
+
+		if (option_set('S')) {
+			start = NumberOfNewTexts + 1;
+		}
+		else if (option_set('s')) {
+			start = new_txt + 1;
+		}
+		else {
+			start = new_txt;
+		}
+
+		if (!option_set('e')) {
+			/* all texts from start onwards in one action */
+			if (start < NumberOfTexts) {
+				compare1text(new_txt, start, NumberOfTexts);
+			}
+		}
+		else {
+			/* the texts from start onwards one at a time */
+			int other;
+
+			for (other = start; other < NumberOfTexts; other++) {
+				compare1text(new_txt, other, other+1);
+			}
+		}
+		new_txt++;
+	}
+}
+
+/*	Searches text n for runs it has in common with the texts first up
+	to (not including) limit, and reports each run through add_run().
+*/
+static void
+compare1text(
+	int n,				/* text to be compared */
+	int first,			/* first text to be compared to */
+	int limit			/* limit text in comparison */
+) {
+	struct text *subject = &Text[n];
+	unsigned int range_start = Text[first].tx_start;
+	unsigned int range_limit = Text[limit-1].tx_limit;
+	unsigned int pos;
+
+	for (	pos = subject->tx_start;
+		/* a run of MinRunSize tokens still fits */
+		pos + MinRunSize - 1 < subject->tx_limit;
+		/* pos is advanced in the body */
+	) {
+		struct text *best_txt;
+		unsigned int best_pos;
+		unsigned int best_len = lcs(
+			subject, pos, &best_txt, &best_pos,
+			range_start, range_limit
+		);
+
+		if (best_len == 0) {
+			/* nothing here; move on one token */
+			pos++;
+			continue;
+		}
+		/* register the run and step over it */
+		add_run(subject, pos, best_txt, best_pos, best_len);
+		pos += best_len;
+	}
+}
+
+static unsigned int
+lcs(	struct text *txt0,		/* input: starting position */
+	unsigned int i0,
+	struct text **tbp,		/* output: position of best run */
+	unsigned int *ibp,
+	unsigned int i_first,		/* no comparison before this pos. */
+	unsigned int i_limit		/* no comparison after this pos. */
+) {
+	/*	Finds the longest common substring (not -sequence) in:
+			txt0, starting precisely at i0 and
+			the text between i_first and i_limit.
+		Writes the position in tbp and ibp and returns the size.
+		Returns 0 if no common substring is found.
+
+		Candidate positions are obtained by following the chain
+		of ForwardReference() values that starts at i0.  *tbp and
+		*ibp are written only when a run is recorded, so they are
+		meaningful only if the result is non-zero.
+	*/
+	register struct text *txt1 = txt0;
+	register unsigned int i1 = i0;
+	register unsigned int size_best = 0;
+	register unsigned int txt0limit = txt0->tx_limit;
+	register unsigned int txt1limit = txt1->tx_limit;
+
+	while (	/* there is a next opportunity */
+		(i1 = ForwardReference(i1))
+	&&	/* it is still in range */
+		i1 < i_limit
+	) {
+		register unsigned int min_size;
+		register unsigned int new_size;
+		register unsigned int j0;
+		register unsigned int j1;
+
+		if (i1 < i_first) {	/* not in range */
+			continue;
+		}
+
+		/* bump txt1; we may have skipped a text or two */
+		while (i1 >= txt1->tx_limit) {
+			txt1++;
+		}
+		txt1limit = txt1->tx_limit;
+
+		/* a candidate is only interesting if it beats the best so far */
+		min_size = (size_best ? size_best+1 : MinRunSize);
+		/* are we looking at something better than we have got? */
+		{
+			/* j0 and j1 index the last token of the two
+			   would-be runs of min_size tokens
+			*/
+			j0 = i0 + min_size - 1;
+			j1 = i1 + min_size - 1;
+			if (	/* j0 still inside txt0 */
+				j0 < txt0limit
+			&&	/* j1 still inside txt1 */
+				j1 < txt1limit
+			&&	/* j0 and j1 don't overlap */
+				/* j1 - min_size + 1 equals i1 */
+				j0 < j1 - min_size + 1
+			) {
+				/* there would be room enough */
+				register int cnt = min_size;
+
+				/* does the text match? */
+				/* compared backwards, from the last token down */
+				while (	cnt
+				&&	TOKEN_EQ(TokenArray[j0], TokenArray[j1])
+				) {
+					cnt--, j0--, j1--;
+				}
+				if (cnt) continue;	/* forget it */
+			}
+			else continue;			/* forget it */
+		}
+
+		/* yes, we are; how long can we make it? */
+		{
+			register unsigned int size = min_size;
+
+			/* j0 and j1 now index the first token beyond the runs */
+			j0 = i0 + min_size;
+			j1 = i1 + min_size;
+			while (	/* j0 still inside txt0 */
+				j0 < txt0limit
+			&&	/* j1 still inside txt1 */
+				j1 < txt1limit
+			&&	/* j0 and j1 don't overlap */
+				j0 + size < j1
+			&&	/* tokens are the same */
+				TOKEN_EQ(TokenArray[j0], TokenArray[j1])
+			) {
+				j0++, j1++, size++;
+			}
+			new_size = size;
+		}
+
+		/*	offer the run to the Language Department which may
+			reject it or may cut its tail
+		*/
+		new_size = (	MayBeStartOfRun(TokenArray[i0])
+			   ?	CheckRun(&TokenArray[i0], new_size)
+			   :	0
+			   );
+
+		if (	/* we still have something acceptable */
+			new_size >= MinRunSize
+		&&	/* it is better still than what we had */
+			new_size > size_best
+		) {
+			/* record it */
+			*tbp = txt1;
+			*ibp = i1;
+			size_best = new_size;
+		}
+	}
+
+	return size_best;
+}

+ 11 - 0
utils/sim_pasc/compare.h

@@ -0,0 +1,11 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: compare.h,v 1.2 1998/01/21 14:27:47 dick Exp $
+*/
+
+/*	Compares each new text to the appropriate texts.
+	Stores the runs found in the AISO heap.
+	Runs contain references to positions in the input files.
+*/
+
+extern void Compare(void);

+ 20 - 0
utils/sim_pasc/debug.par

@@ -0,0 +1,20 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: debug.par,v 1.3 1998/02/03 14:28:21 dick Exp $
+*/
+
+#undef	DB_FORW_REF			/* print & check forward references */
+#undef	DB_TEXT				/* print all text parts */
+#undef	DB_POS				/* print positions in files */
+#undef	DB_NL_BUFF			/* print the newline count buffer */
+#undef	DB_RUN				/* print all identified runs */
+
+#ifdef	lint
+
+#define	DB_FORW_REF
+#define	DB_TEXT
+#define	DB_POS
+#define	DB_NL_BUFF
+#define	DB_RUN
+
+#endif	/* lint */

+ 16 - 0
utils/sim_pasc/error.c

@@ -0,0 +1,16 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: error.c,v 2.4 1998/02/03 14:28:22 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<stdlib.h>
+
+#include	"sim.h"
+#include	"error.h"
+
+/*	Writes msg to stderr, prefixed by the program name, and terminates
+	the program with exit status 1; does not return.
+*/
+void
+fatal(const char *msg) {
+	fprintf(stderr, "%s: %s\n", progname, msg);
+	exit(1);
+}

+ 6 - 0
utils/sim_pasc/error.h

@@ -0,0 +1,6 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: error.h,v 1.3 1998/02/03 14:28:23 dick Exp $
+*/
+
+extern void fatal(const char *msg);

+ 386 - 0
utils/sim_pasc/hash.c

@@ -0,0 +1,386 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: hash.c,v 2.8 2005/02/20 17:03:00 dick Exp $
+*/
+
+/*	Text is compared by comparing every substring to all substrings
+	to the right of it; this process is in essence quadratic.  However,
+	only substrings of length at least 'MinRunSize' are of interest,
+	which gives us the possibility to speed up this process by using
+	a hash table.
+
+	For every position in the text, we construct an index which gives
+	the next position in the text at which a run of MinRunSize tokens
+	starts that has the same hash code, as calculated by hash1().  If
+	there is no such run, the index is 0.  These forward references are
+	kept in the array forward_references[].
+
+	To construct this array, we use a hash table last_index[] whose size
+	is a prime and which is about 8 times smaller than the text array.
+	The hash table last_index[] is set up such that last_index[i] is the
+	index of the latest token with hash_code i, or 0 if there is none.
+	This results in hash chains of an average length of 8.  See
+	MakeForwardReferences().
+
+	If there is not enough room for a hash table of the proper size
+	(which can be considerable) the hashing is not efficient any more.
+	In that case, the forward reference table is scanned a second time,
+	eliminating from any chain all references to runs that do not hash to
+	the same value under a second hash function, hash2().  For the UNIX
+	manuals this reduced the number of matches from 91.9% to 1.9% (of
+	which 0.06% was genuine).
+*/
+
+#include	<stdio.h>
+#include	<malloc.h>
+
+#include	"system.par"
+#include	"debug.par"
+#include	"sim.h"
+#include	"error.h"
+#include	"language.h"
+#include	"token.h"
+#include	"tokenarray.h"
+#include	"options.h"
+#include	"hash.h"
+
+							/* MAIN ENTRIES */
+static unsigned int *forward_references;	/* to be filled by malloc() */
+static int n_forward_references;
+
+static void make_forward_references_hash1(void);
+static void make_forward_references_hash2(void);
+
+#ifdef	DB_FORW_REF
+static void db_forward_references(const char *);
+static void make_forward_references_hash3(void);
+#endif
+
+void
+MakeForwardReferences(void) {
+	/*	Allocates a zeroed forward reference table with one entry
+		per text position, then fills it by the successive
+		hashing passes.  Allocation failure is fatal.
+	*/
+	n_forward_references = TextLength();
+
+	forward_references = (unsigned int *)
+		calloc(n_forward_references, sizeof (unsigned int));
+	if (forward_references == 0) {
+		fatal("out of memory");
+	}
+
+	make_forward_references_hash1();
+	make_forward_references_hash2();
+#ifdef	DB_FORW_REF
+	make_forward_references_hash3();
+#endif
+}
+
+/*	Yields the forward reference stored for position i.  Any i outside
+	1 .. n_forward_references-1 is treated as a fatal internal error.
+*/
+unsigned int
+ForwardReference(int i) {
+	const int in_range = (0 < i && i < n_forward_references);
+
+	if (!in_range) {
+		fatal("internal error, bad forward reference");
+	}
+	return forward_references[i];
+}
+
+/*	Releases the forward reference table.  The pointer itself is not
+	cleared, so ForwardReference() must not be called afterwards.
+*/
+void
+FreeForwardReferences(void) {
+	free((char *)forward_references);
+}
+
+							/* HASHING */
+/*
+	We want a hash function whose time cost does not depend on
+	MinRunSize, which is a problem since the size of the value
+	we derive the hash function from IS equal to MinRunSize!
+	Therefore we base the hash function on a sample of at most 24
+	tokens from the input string; this works at least as well in
+	practice.  These 24 token values will result in exactly 31
+	bits under the hashing algorithm used, which avoids an
+	overflow test.  So this 24 bears no relation to the default
+	run size (although the fit is surprising!)
+*/
+
+#define	N_SAMPLES	24
+#define	OPERATION	^
+
+/*	An alternative algorithm; does not seem to make any difference.
+#define	N_SAMPLES	23
+#define	OPERATION	+
+*/
+
+/*	Another algorithm; not yet tested
+#define	N_SAMPLES	24
+#define	OPERATION	+ 613 *
+*/
+
+static unsigned int *last_index;
+static unsigned int hash_table_size;
+static int sample_pos[N_SAMPLES];
+
+static unsigned int
+prime[] = {		/* lots of hopefully suitable primes */
+	10639,
+	21283,
+	42571,
+	85147,
+	170227,
+	340451,
+	680959,
+	1361803,
+	2723599,
+	5447171,
+	10894379,
+	21788719,
+	43577399,
+	87154759,
+	174309383,
+	348618827,
+	697237511,
+	1394475011
+};
+
+static void
+init_hash_table(void) {
+	/*	Allocates the zeroed hash table last_index[], setting
+		hash_table_size to the prime chosen, and computes the
+		N_SAMPLES sample offsets used by hash1().
+		Running out of memory for even the smallest table is fatal.
+	*/
+	const int n_primes = (int)(sizeof prime / sizeof prime[0]);
+	int n;
+
+	/*	find the ideal hash table size: the first prime that is
+		not smaller than the text length; if the text is longer
+		than the largest prime, settle for that largest prime
+		rather than running off the end of prime[]
+	*/
+	n = 0;
+	while (n < n_primes - 1 && prime[n] < TextLength()) {
+		n++;
+	}
+
+	/* see if we can allocate that much space, and if not, step down */
+	last_index = 0;
+	while (!last_index && n >= 0) {
+		hash_table_size = prime[n];
+		last_index = (unsigned int *)
+			calloc(hash_table_size, sizeof (unsigned int));
+		n--;
+	}
+	if (!last_index) {
+		fatal("out of memory");
+	}
+
+	/* find sample positions */
+	for (n = 0; n < N_SAMPLES; n++) {
+		/*	straight-line approximation, with rounding: spreads
+			the samples evenly from offset 0 to MinRunSize-1
+		*/
+		sample_pos[n] = (
+			(2 * n * (MinRunSize - 1) + (N_SAMPLES - 1))
+		/	(2 * (N_SAMPLES - 1))
+		);
+	}
+}
+
+static int hash1(const TOKEN *);
+
+static void
+make_forward_references_hash1(void) {
+	/*	First pass: chains together, through forward_references[],
+		all positions whose MinRunSize-token run has the same
+		hash1() code.  last_index[h] remembers the latest position
+		seen with code h; the table is freed when the pass is done.
+	*/
+	int t;
+
+	init_hash_table();
+
+	for (t = 0; t < NumberOfTexts; t++) {
+		struct text *txt = &Text[t];
+		unsigned int pos = txt->tx_start;	/* >= 1 */
+
+		/* all positions in txt except the last MinRunSize-1 */
+		while (pos + MinRunSize - 1 < txt->tx_limit) {
+			if (MayBeStartOfRun(TokenArray[pos])) {
+				int h = hash1(&TokenArray[pos]);
+				unsigned int previous = last_index[h];
+
+				if (previous != 0) {
+					/* link the previous occurrence to us */
+					forward_references[previous] = pos;
+				}
+				last_index[h] = pos;
+			}
+			pos++;
+		}
+	}
+	free((char *)last_index);
+
+#ifdef	DB_FORW_REF
+	db_forward_references("first hashing");
+#endif	/* DB_FORW_REF */
+}
+
+static int
+hash1(const TOKEN *p) {
+	/*	hash1(p) returns the hash code of the MinRunSize
+		tokens starting at p; caller guarantees that there
+		are at least MinRunSize tokens.
+		Only the N_SAMPLES tokens at the offsets in sample_pos[]
+		take part; the result is reduced modulo hash_table_size.
+	*/
+	register int32 h_val;
+	register int n;
+	
+	h_val = 0;
+	for (n = 0; n < N_SAMPLES; n++) {
+		/* shift the accumulator one bit and mix in the next sample */
+		h_val = (h_val << 1) OPERATION TOKEN2int(p[sample_pos[n]]);
+#if	N_SAMPLES > 24
+		/* NOTE(review): compiled only for N_SAMPLES > 24; 1<<31
+		   does not fit in a 32-bit int, so check this path
+		   before enabling it
+		*/
+		if (h_val & (1<<31)) {
+			h_val ^= (1<<31|1);
+		}
+#endif
+	}
+	/* just in case somebody tries wrong N_SAMPLES and OPERATION values: */
+	if (h_val < 0) fatal("corrupt hash algorithm in hash1() in hash.c");
+
+	return h_val % hash_table_size;
+}
+
+static int hash2(const TOKEN *);
+
+static void
+make_forward_references_hash2(void) {
+	/*	Second pass: rewrites each forward reference so that it
+		skips the chain members whose hash2() code differs from
+		that of the referring position.
+	*/
+	register unsigned int i;
+
+	/* do a second hash only if the original hash table was reduced */
+	/*	Meanwhile, the quality of the primary hashing is so bad
+		that we are virtually forced to always do a second scan.
+	*/
+
+	/*	Clean out spurious matches, by a quadratic algorithm.
+		Note that we do not want to eliminate overlapping
+		sequences in this stage, since we might be removing the
+		wrong copy.
+	*/
+	for (i = 0; i+MinRunSize < TextLength(); i++) {
+		register unsigned int j = i;
+		register int h2 = hash2(&TokenArray[i]);
+
+		/*	Find the first token sequence in the chain
+			with same secondary hash code.
+		*/
+		while (	/* there is still a forward reference */
+			(j = forward_references[j])
+		&&	/* its hash code does not match */
+			hash2(&TokenArray[j]) != h2
+		) {
+			/* continue searching */
+		}
+		/* short-circuit forward reference to it, or to zero */
+		forward_references[i] = j;
+	}
+
+#ifdef	DB_FORW_REF
+	db_forward_references("second hashing");
+#endif	/* DB_FORW_REF */
+}
+
+static int
+hash2(const TOKEN *p) {
+	/*	a simple-minded hashing for the secondary sweep;
+		first and last token combined in a short int:
+		the first token of the run shifted left by 8 bits, plus
+		the token at offset MinRunSize-1
+	*/
+	return (TOKEN2int(p[0]) << 8) + TOKEN2int(p[MinRunSize-1]);
+}
+
+#ifdef	DB_FORW_REF
+
+static int hash3(const TOKEN *, const TOKEN *);
+
+static void
+make_forward_references_hash3(void) {
+	/*	Debugging pass: rewrites each forward reference so that it
+		points to the next chain member whose MinRunSize tokens
+		are really identical, or to zero.
+	*/
+	register unsigned int i;
+
+	/* do a third hash to check up on the previous two */
+
+	/* this time we use a genuine compare */
+	for (i = 0; i+MinRunSize < TextLength(); i++) {
+		register unsigned int j = i;
+
+		while (	/* there is still a forward reference */
+			(j = forward_references[j])
+		&&	/* the tokens at i and j are not all equal */
+			!hash3(&TokenArray[i], &TokenArray[j])
+		) {
+			/* continue searching */
+		}
+		/* short-circuit forward reference to it, or to zero */
+		forward_references[i] = j;
+	}
+
+	db_forward_references("third hashing");
+}
+
+static int
+hash3(const TOKEN *p, const TOKEN *q) {
+	/*	Not a hash at all: compares the MinRunSize tokens at p
+		with those at q; yields 1 if all are equal, 0 otherwise.
+	*/
+	int k = 0;
+
+	while (k < MinRunSize) {
+		if (TOKEN2int(p[k]) != TOKEN2int(q[k])) {
+			return 0;
+		}
+		k++;
+	}
+	return 1;
+}
+
+static int
+db_frw_chain(int n, char *crossed_out) {
+	/*	Follows the forward reference chain that starts at n,
+		marking every position visited in crossed_out[] and
+		complaining about positions that were marked already.
+		Returns the number of links in the chain, which is one
+		less than the number of positions on it.
+	*/
+	int links = -1;
+	int at = n;
+
+	while (at) {
+		if (crossed_out[at]) {
+			fprintf(DebugFile,
+				">>>> error in forward_references[] <<<<\n"
+			);
+		}
+		links++;
+		crossed_out[at]++;
+		at = forward_references[at];
+	}
+	fprintf(DebugFile, "n = %d, chain_len = %d\n", n, links);
+
+	return links;
+}
+
+static void
+db_forward_references(const char *msg) {
+	/*	Debugging aid: checks the consistency of the forward
+		reference chains and writes chain statistics to DebugFile;
+		msg names the pass that has just finished.
+	*/
+	int n;
+	int n_frw_chains = 0;		/* number of forward ref. chains */
+	int tot_frwc_len = 0;
+	char *crossed_out;
+
+	fprintf(DebugFile, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
+	fprintf(DebugFile, "hash_table_size = %u\n", hash_table_size);
+	fprintf(DebugFile, "N_SAMPLES = %d\n", N_SAMPLES);
+
+	/* one zeroed mark per text position */
+	crossed_out = (char *)calloc(TextLength(), sizeof (char));
+	if (!crossed_out) {
+		fatal(">>>> no room for db_forward_references debug table <<<<\n");
+	}
+
+	/*	Each forward_references[n] starts in principle a new
+		chain, and these chains never touch each other.
+		We check this property by marking the positions in each
+		chain in an array; if we meet a marked entry while
+		following a chain, it must have been on an earlier chain
+		and we have an error.
+		We also determine the lengths of the chains, for statistics.
+	*/
+	if (forward_references[0]) {
+		fprintf(DebugFile,
+			">>>> forward_references[0] is not zero <<<<\n"
+		);
+	}
+	for (n = 1; n < TextLength(); n++) {
+		if (forward_references[n] && !crossed_out[n]) {
+			/* start of a new chain */
+			n_frw_chains++;
+			tot_frwc_len += db_frw_chain(n, crossed_out);
+		}
+	}
+	free((char *)crossed_out);
+
+	fprintf(DebugFile,
+		"text length = %u, # forward chains = %d, total frw chain length = %d\n\n",
+		TextLength(), n_frw_chains, tot_frwc_len
+	);
+}
+
+#endif	/* DB_FORW_REF */

+ 12 - 0
utils/sim_pasc/hash.h

@@ -0,0 +1,12 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: hash.h,v 1.1 1997/06/20 12:03:14 dick Exp $
+*/
+
+/*	Creating and consulting the ForwardReference array; to speed up
+	the Longest Substring Algorithm.
+*/
+
+extern void MakeForwardReferences(void);
+extern void FreeForwardReferences(void);
+extern unsigned int ForwardReference(int i);

+ 67 - 0
utils/sim_pasc/idf.c

@@ -0,0 +1,67 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: idf.c,v 2.8 2005/02/20 17:03:00 dick Exp $
+*/
+
+#include	<string.h>
+
+#include	"system.par"
+#include	"token.h"
+#include	"idf.h"
+
+/*	Looks up str in list[] by binary search and returns the id_tr of
+	the entry whose id_tag equals str, or dflt if there is no such
+	entry.
+	listsize is the size of the list in bytes (sizeof list), not the
+	number of entries.
+	The entries must be in strcmp() order; entries that are out of
+	order may not be found.
+*/
+TOKEN
+idf_in_list(
+	const char *str,
+	const struct idf list[],
+	unsigned int listsize,
+	TOKEN dflt
+) {
+	register unsigned int n_entries = listsize / sizeof (struct idf);
+	register int first = 0;
+	register int last;
+
+	/* an empty list has no list[0] to compare with */
+	if (n_entries == 0) return dflt;
+	last = (int)n_entries - 1;
+
+	while (first < last) {
+		/* written this way the midpoint cannot overflow */
+		register int middle = first + (last - first) / 2;
+
+		if (strcmp(str, list[middle].id_tag) > 0) {
+			first = middle + 1;
+		}
+		else {
+			last = middle;
+		}
+	}
+	/* first == last: the only entry that can still match */
+	return (strcmp(str, list[first].id_tag) == 0
+	?	list[first].id_tr
+	:	dflt
+	);
+}
+
+/*	Returns a token in the range NORM(1) .. NORM(253), derived from
+	str by hashing; this avoids SKIP (0) and EOL (255).
+
+	The hash is computed in unsigned arithmetic restricted to 32 bits.
+	Overflowing a signed integer and left-shifting a negative one are
+	both undefined behaviour in C, whereas unsigned arithmetic wraps.
+	The values produced are those a wrapping 32-bit two's-complement
+	computation would give.
+*/
+TOKEN
+idf_hashed(const char *str) {
+	register unsigned long h = 0;	/* only the low 32 bits are used */
+
+	/* let's be careful about ranges; if done wrong it's hard to debug */
+	while (*str) {
+		/* h = 2^32-1 (i.e. -1) or 0 <= h <= 2^31-1 */
+		h = ((h << 1) + (unsigned long)(*str++&0377)) & 0xFFFFFFFFUL;
+		if (h & 0x80000000UL) {
+			/* top bit set: negative when seen as signed */
+			h = (h + 0x7FFFFFFFUL) & 0xFFFFFFFFUL;	/* += 2^31-1 */
+			/* h = 2^32-1 or 0 <= h <= 2^31-2 */
+		}
+	}
+	if (h & 0x80000000UL) {
+		/* h = 2^32-1 (i.e. -1) */
+		/* a very small chance, but all the same */
+		h = 0;
+	}
+	/* 0 <= h <= 2^31-1 */
+	h %= 253;				/* 0 <= h < 253 */
+	return NORM((int)(h + 1));		/* 1 <= h < 254 */
+	/* this avoids SKIP (0) and EOL (255) */
+}

+ 31 - 0
utils/sim_pasc/idf.h

@@ -0,0 +1,31 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: idf.h,v 2.5 1998/02/03 14:28:25 dick Exp $
+*/
+
+/*	Idf module:
+	TOKEN idf_in_list(char *str, struct idf l[], sizeof l, TOKEN dflt);
+		looks up a keyword in a list of keywords l, represented as an
+		array of struct idf, and returns its translation as a token;
+		dflt is returned if the keyword is not found.
+	TOKEN idf_hashed(char *str);
+		returns a token unequal to SKIP or EOL, derived from the str
+		through hashing
+	It is assumed that SKIP will be ignored by the user of this module.
+*/
+
+#include	"token.h"
+
+/* the struct for keywords etc. */
+struct idf {
+	char *id_tag;	/* an interesting identifier */
+	TOKEN id_tr;	/* with its one-token translation */
+};
+
+/* special tokens for the idf module */
+#define	SKIP		NORM('\0')
+#define	IDF		NORM('@')
+
+/* public functions */
+extern TOKEN idf_in_list(const char *, const struct idf [], unsigned int, TOKEN);
+extern TOKEN idf_hashed(const char *);

+ 270 - 0
utils/sim_pasc/javalang.l

@@ -0,0 +1,270 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: javalang.l,v 1.4 2007/08/29 09:10:32 dick Exp $
+*/
+
+/*
+	Java language front end for the similarity tester.
+	Author:	Dick Grune <[email protected]>
+*/
+
+#include	"options.h"
+#include	"algollike.h"
+#include	"token.h"
+#include	"idf.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+
+static const struct idf reserved[] = {
+	{"abstract",	NORM('a')},
+	{"boolean",	NORM('b')},
+	{"break",	NORM('B')},
+	{"byte",	CTRL('B')},
+	{"case",	NORM('c')},
+	{"catch",	NORM('C')},
+	{"char",	CTRL('C')},
+	{"class",	META('c')},
+	{"continue",	META('C')},
+	{"default",	NORM('d')},
+	{"do",		NORM('D')},
+	{"double",	CTRL('D')},
+	{"else",	NORM('e')},
+	{"extends",	NORM('E')},
+	{"false",	NORM('g')},	/* Boolean literal */
+	{"final",	NORM('f')},
+	{"finally",	NORM('F')},
+	{"float",	CTRL('F')},
+	{"for",		META('f')},
+	{"if",		NORM('i')},
+	{"implements",	NORM('I')},
+	{"import",	CTRL('I')},
+	{"instanceof",	META('i')},
+	{"int",		META('I')},
+	{"interface",	MTCT('I')},
+	{"long",	NORM('l')},
+	{"native",	NORM('n')},
+	{"new",		NORM('N')},
+	{"null",	CTRL('N')},	/* null literal */
+	{"package",	NORM('p')},
+	{"private",	NORM('P')},
+	{"protected",	CTRL('P')},
+	{"public",	META('p')},
+	{"return",	NORM('r')},
+	{"short",	NORM('s')},
+	{"static",	NORM('S')},
+	{"super",	CTRL('S')},
+	{"switch",	META('s')},
+	{"synchronized",META('S')},
+	{"this",	NORM('t')},
+	{"throw",	NORM('T')},
+	{"throws",	CTRL('T')},
+	{"true",	META('t')},	/* Boolean literal */
+	{"void",	NORM('v')},
+	{"volatile",	NORM('V')},
+	{"while",	NORM('w')}
+};
+
+/* Special treatment of identifiers */
+
+/*	Translates the identifier in yytext to a token: the translation of
+	the reserved word if it is one, otherwise IDF, or, when hashing is
+	set, a one-token hash code of the identifier.
+*/
+static TOKEN
+idf2token(int hashing) {
+	register TOKEN tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
+
+	if (!hashing) return tk;
+	if (!TOKEN_EQ(tk, IDF)) return tk;	/* a reserved word */
+
+	/* an ordinary identifier: return a one-token hash code */
+	return idf_hashed(yytext);
+}
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	IDF,		/* identifier */
+	NORM('{'),
+	NORM('('),
+	NORM('a'),	/* abstract */
+	NORM('b'),	/* boolean */
+	NORM('B'),	/* break */
+	CTRL('B'),	/* byte */
+	NORM('c'),	/* case */
+	NORM('C'),	/* catch */
+	CTRL('C'),	/* char */
+	META('c'),	/* class */
+	META('C'),	/* continue */
+	NORM('d'),	/* default */
+	NORM('D'),	/* do */
+	CTRL('D'),	/* double */
+	NORM('e'),	/* else */
+	NORM('E'),	/* extends */
+	NORM('f'),	/* final */
+	NORM('F'),	/* finally */
+	CTRL('F'),	/* float */
+	META('f'),	/* for */
+	NORM('i'),	/* if */
+	NORM('I'),	/* implements */
+	CTRL('I'),	/* import */
+	META('i'),	/* instanceof */
+	META('I'),	/* int */
+	MTCT('I'),	/* interface */
+	NORM('l'),	/* long */
+	NORM('n'),	/* native */
+	NORM('N'),	/* new */
+	NORM('p'),	/* package */
+	NORM('P'),	/* private */
+	CTRL('P'),	/* protected */
+	META('p'),	/* public */
+	NORM('r'),	/* return */
+	NORM('s'),	/* short */
+	NORM('S'),	/* static */
+	CTRL('S'),	/* super */
+	META('s'),	/* switch */
+	META('S'),	/* synchronized */
+	NORM('T'),	/* throw */
+	CTRL('T'),	/* throws */
+	NORM('v'),	/* void */
+	NORM('V'),	/* volatile */
+	NORM('w'),	/* while */
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM(')'),
+	NORM('}'),
+	NORM(';'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('{'),
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM('}'),
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+Digit		([0-9a-fA-F])
+
+UniCode		(\\u{Digit}{Digit}{Digit}{Digit})
+AnyQuoted	((\\.)|{UniCode})
+StrChar		([^"\n\\]|{AnyQuoted})
+ChrChar		([^'\n\\]|{AnyQuoted})
+
+StartComment	("/*")
+EndComment	("*/")
+SafeComChar	([^*\n])
+UnsafeComChar	("*")
+
+SingleLineCom	("//".*)
+
+Idf		([A-Za-z][A-Za-z0-9_]*)
+
+%%
+
+{StartComment}	{
+		/*	We do not have one single pattern to match a comment
+			(although one can be written), for two reasons.
+			The matched string might overflow lex-internal buffers
+			like yysbuf and yytext; and the pattern would be very
+			complicated and overtax lex.
+			So we break up the string into safe chunks and keep
+			track of where we are in a start condition <Comment>.
+		*/
+		BEGIN Comment;
+	}
+
+<Comment>{SafeComChar}+	{		/* safe comment chunk */
+	}
+
+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
+	}
+
+<Comment>"\n"		{		/* to break up long comments */
+		return_eol();
+	}
+
+<Comment>{EndComment}	{		/* end-of-comment */
+		BEGIN INITIAL;
+	}
+
+{SingleLineCom}"\n"	{		/* single-line comment */
+		return_eol();
+	}
+
+\"{StrChar}*\"	{			/* strings */
+		return_ch('"');
+	}
+
+\'{ChrChar}+\'	{			/* characters */
+		return_ch('\'');
+	}
+
+(0x)?{Digit}+("l"|"L")?	{		/* numeral, passed as an identifier */
+		return_tk(IDF);
+	}
+
+"import"{Layout}[^;]*;	{		/* import statement; ignore */
+	}
+
+{Idf}/"("	{			/* identifier in front of ( */
+		register TOKEN tk;
+
+		tk = idf2token(option_set('F'));
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+{Idf}	{				/* identifier */
+		register TOKEN tk;
+
+		tk = idf2token(0 /* no hashing */);
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+\;	{				/* semicolon, conditionally ignored */
+		if (option_set('f')) return_ch(yytext[0]);
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	return 1;
+}

+ 32 - 0
utils/sim_pasc/lang.h

@@ -0,0 +1,32 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: lang.h,v 1.2 1998/01/21 14:27:51 dick Exp $
+*/
+
+/*
+	The token-providing module 'lang' has three interfaces:
+	-	lang.h, which provides access to the lowest-level token
+			routines, to be used by the next level.
+	-	lex.h, which provides the lex variables, to be used by
+			all and sundry.
+	-	language.h, which provides language-specific info about
+			tokens, concerning their suitability as initial
+			and final tokens, to be used by higher levels.
+			
+	This structure is not satisfactory, but it is also unreasonable
+	to combine them in one interface.
+
+	There is no single lang.c; rather it is represented by the
+	various Xlang.c files generated from the Xlang.l files.
+*/
+
+#include	"token.h"
+
+/* useful macros */
+#define	return_tk(tk)	{lex_tk_cnt++; lex_token = (tk); return 1;}
+#define	return_ch(ch)	{lex_tk_cnt++; lex_token = int2TOKEN((int)(ch)); return 1;}
+#define	return_eol()	{lex_nl_cnt++; lex_token = EOL; return 1;}
+
+extern int yylex(void);
+extern void yystart(void);
+extern FILE *yyin;

+ 17 - 0
utils/sim_pasc/language.h

@@ -0,0 +1,17 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: language.h,v 1.1 1997/06/20 12:03:15 dick Exp $
+*/
+
+/*	The abstract class Language contains the routines InitLanguage,
+	MayBeStartOfRun and CheckRun which describe in some sense the
+	language and which are required by compare.c.
+	
+	These routines must be provided by all Xlang.l files.
+*/
+
+#include	"token.h"
+
+extern void InitLanguage(void);
+extern int MayBeStartOfRun(TOKEN ch);
+extern unsigned int CheckRun(const TOKEN *str, unsigned int size);

+ 16 - 0
utils/sim_pasc/lex.c

@@ -0,0 +1,16 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: lex.c,v 1.3 1998/02/03 14:28:26 dick Exp $
+*/
+
+/*	The communication variables, as set by yylex, NextStreamTokenObtained
+	and NextTextTokenObtained.
+*/
+
+#include	"token.h"
+#include	"lex.h"
+
+TOKEN lex_token;			/* token produced, or EOL */
+unsigned int lex_nl_cnt;		/* line count */
+unsigned int lex_tk_cnt;		/* token position */
+unsigned int lex_non_ascii_cnt;		/* # of non-ASCII chars found */

+ 19 - 0
utils/sim_pasc/lex.h

@@ -0,0 +1,19 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: lex.h,v 2.5 1998/02/03 14:28:27 dick Exp $
+*/
+
+/*	Since the lex_X variables are hoisted unchanged through the levels
+	lang, stream, and buff, to be used by pass1, pass2, etc., they
+	have to be placed in a module of their own.
+*/
+
+#include	"token.h"
+
+/* special tokens */
+#define	EOL		NORM(0377)	/* end of line */
+
+extern TOKEN lex_token;			/* token produced, or EOL */
+extern unsigned int lex_nl_cnt;		/* line count */
+extern unsigned int lex_tk_cnt;		/* token position */
+extern unsigned int lex_non_ascii_cnt;	/* # of non-ASCII chars found */

+ 123 - 0
utils/sim_pasc/lisplang.l

@@ -0,0 +1,123 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: lisplang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
+*/
+
+/*
+	LISP language front end for the similarity tester.
+	Author:	Gertjan Akkerman <[email protected]>
+	Date:	Thu, 9 Apr 87 11:15:23 MDT
+*/
+
+#include	"language.h"
+#include	"token.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+#include	"idf.h"
+
+static const struct idf reserved[] = {	/* looked up through idf_in_list(), which bisects with strcmp(): keep in strcmp() order */
+	{"append",	NORM('a')},
+	{"append1",	NORM('b')},
+	{"atom",	NORM('t')},		/* NOTE(review): same token as cdr below; confirm this is intended */
+	{"car",		NORM('h')},
+	{"cdr",		NORM('t')},
+	{"cond",	NORM('c')},
+	{"cons",	NORM('s')},
+	{"defun",	NORM('u')},
+	{"do",		NORM('d')},
+	{"eq",		NORM('e')},
+	{"equal",	NORM('e')},		/* See eq */
+	{"for",		NORM('f')},
+	{"if",		NORM('i')},
+	{"list",	NORM('l')},
+	{"nconc",	NORM('n')},
+	{"rplaca",	NORM('A')},
+	{"rplacd",	NORM('D')}
+};
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+AnyQuoted	(\\.)
+StrChar		([^"\n\\]|{AnyQuoted})
+ChrChar		([^'\\]|{AnyQuoted})
+
+IdfChar		([-!#$%&*+,/0-9:;<=>?@A-Z\\^_`a-z{}~])
+
+EscIdf		(({IdfChar}|\\.)+)
+QuotIdf		("|"[^\|\n]*"|")
+Idf		({EscIdf}|{QuotIdf})
+
+%%
+
+";".*$	{				/* comment */
+	}
+
+\"{StrChar}*\"	{			/* strings */
+		return_ch('"');
+	}
+
+{Idf}	{				/* identifier */
+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	return 1;
+}

+ 319 - 0
utils/sim_pasc/m2lang.l

@@ -0,0 +1,319 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: m2lang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
+*/
+
+/*
+	Modula-2 language front end for the similarity tester.
+	Author:	Dick Grune <[email protected]>
+*/
+
+#include	"options.h"
+#include	"algollike.h"
+#include	"token.h"
+#include	"idf.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+
+/*	Most Modula-2 programs start with a number of IMPORTs that look
+	very similar from program to program.  These are skipped by ignoring
+	the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT
+	and FROM, having a flag skip_imports, and start reacting only
+	at the first non-ignored reserved word.
+
+	Also, the nesting comments require a state variable.
+*/
+
+/* Additional state variables, set in yystart() */
+static int skip_imports;
+static int comment_level;
+
+/* Data for module idf */
+
+static const struct idf reserved[] = {
+	{"AND",		NORM('&')},
+	{"ARRAY",	NORM('A')},
+	{"BEGIN",	NORM('{')},
+	{"BY",		NORM('B')},
+	{"CASE",	NORM('c')},
+	{"CONST",	NORM('C')},
+	{"DEFINITION",	SKIP},
+	{"DIV",		NORM('/')},
+	{"DO",		NORM('D')},
+	{"ELSE",	NORM('e')},
+	{"ELSIF",	NORM('e')},
+	{"END",		NORM('}')},
+	{"EXIT",	NORM('E')},
+	{"EXPORT",	CTRL('E')},
+	{"FOR",		NORM('F')},
+	{"FROM",	SKIP},
+	{"IF",		NORM('i')},
+	{"IMPLEMENTATION", SKIP},
+	{"IMPORT",	SKIP},
+	{"IN",		NORM('I')},
+	{"LOOP",	NORM('l')},
+	{"MOD",		NORM('%')},
+	{"MODULE",	SKIP},
+	{"NOT",		NORM('~')},
+	{"OF",		SKIP},
+	{"OR",		NORM('O')},
+	{"POINTER",	NORM('p')},
+	{"PROCEDURE",	NORM('P')},
+	{"QUALIFIED",	NORM('q')},
+	{"RECORD",	NORM('r')},
+	{"REPEAT",	NORM('R')},
+	{"RETURN",	CTRL('r')},
+	{"SET",		NORM('s')},
+	{"THEN",	SKIP},
+	{"TO",		NORM('t')},
+	{"TYPE",	NORM('T')},
+	{"UNTIL",	NORM('u')},
+	{"VAR",		NORM('v')},
+	{"WHILE",	NORM('w')},
+	{"WITH",	NORM('W')},
+};
+
+static const struct idf standard[] = {
+	{"ABS",		META('a')},
+	{"ADDRESS",	META('A')},
+	{"ALLOCATE",	MTCT('A')},
+	{"BITSET",	META('b')},
+	{"BOOLEAN",	META('B')},
+	{"CAP",		META('c')},
+	{"CARDINAL",	META('C')},
+	{"CHAR",	MTCT('C')},
+	{"CHR",		META('x')},
+	{"DEALLOCATE",	META('d')},
+	{"DEC",		META('D')},
+	{"EXCL",	META('e')},
+	{"FALSE",	META('f')},
+	{"FLOAT",	META('F')},
+	{"HALT",	META('h')},
+	{"HIGH",	META('H')},
+	{"INC",		META('i')},
+	{"INCL",	META('I')},
+	{"INTEGER",	MTCT('I')},
+	{"LONGCARD",	META('L')},
+	{"LONGINT",	META('L')},
+	{"LONGREAL",	META('L')},
+	{"MAX",		META('m')},
+	{"MIN",		META('M')},
+	{"NEWPROCESS",	META('n')},
+	{"NIL",		META('N')},
+	{"ODD",		META('o')},
+	{"ORD",		META('O')},
+	{"PROC",	META('p')},
+	{"REAL",	META('r')},
+	{"SIZE",	META('s')},
+	{"SYSTEM",	META('S')},
+	{"TRANSFER",	META('t')},
+	{"TRUE",	META('T')},
+	{"TRUNC",	MTCT('T')},
+	{"VAL",		META('v')},
+	{"WORD",	META('w')}
+};
+
+/* Special treatment of identifiers */
+
+/*	Translates the identifier in yytext to a token.
+	Returns SKIP for reserved words that are to be ignored and for
+	any non-reserved identifier met while skip_imports is still set.
+	Any other reserved word clears skip_imports and yields its own
+	token.  Remaining identifiers yield their token from the list of
+	standard identifiers, or else IDF, or, when hashing is set, a
+	one-token hash code.
+*/
+static TOKEN
+idf2token(int hashing) {
+	register TOKEN tk;
+
+	/* the token can be on two lists, reserved and standard */
+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
+
+	/* is it one of the keywords to be ignored? */
+	if (TOKEN_EQ(tk, SKIP)) return tk;
+
+	/*	The statement below is a significant comment
+		on the value of state variables.
+	*/
+	if (!TOKEN_EQ(tk, IDF)) {
+		/* reserved word, stop the skipping */
+		skip_imports = 0;
+	}
+	else {
+		/* it is an identifier but not a reserved word */
+		if (skip_imports) {
+			/* skip it; SKIP is the value the callers test for */
+			tk = SKIP;
+		}
+		else {
+			/* look further */
+			tk = idf_in_list(yytext, standard, sizeof standard, IDF);
+			if (TOKEN_EQ(tk, IDF) && hashing) {
+				/* return a one-token hash code */
+				tk = idf_hashed(yytext);
+			}
+		}
+	}
+	return tk;
+}
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	IDF,		/* identifier */
+	NORM('{'),	/* also BEGIN */
+	NORM('('),
+	NORM('['),
+	NORM('A'),	/* ARRAY */
+	NORM('c'),	/* CASE */
+	NORM('C'),	/* CONST */
+	NORM('E'),	/* EXIT */
+	NORM('F'),	/* FOR */
+	NORM('i'),	/* IF */
+	NORM('l'),	/* LOOP */
+	NORM('p'),	/* POINTER */
+	NORM('P'),	/* PROCEDURE */
+	NORM('r'),	/* RECORD */
+	NORM('R'),	/* REPEAT */
+	CTRL('R'),	/* RETURN */
+	NORM('s'),	/* SET */
+	NORM('T'),	/* TYPE */
+	NORM('v'),	/* VAR */
+	NORM('w'),	/* WHILE */
+	NORM('W'),	/* WITH */
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM('}'),
+	NORM(')'),
+	NORM(']'),
+	NORM(';'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('{'),
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM('}'),
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+AnyQuoted	(\\.)
+QuStrChar	([^"\n\\]|{AnyQuoted})
+ApoStrChar	([^'\n\\]|{AnyQuoted})
+
+StartComment	("(*")
+EndComment	("*)")
+SafeComChar	([^*\n])
+UnsafeComChar	("*")
+
+Digit		([0-9a-fA-F])
+Idf		([A-Za-z][A-Za-z0-9_]*)
+
+%%
+
+{StartComment}	{			/* See clang.l */
+		/*	Lex itself is incapable of handling Modula-2's
+			nested comments. So let's help it a bit.
+		*/
+		if (comment_level == 0) {
+			BEGIN Comment;
+		}
+		comment_level++;
+	}
+
+<Comment>{SafeComChar}+	{		/* safe comment chunk */
+	}
+
+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
+	}
+
+<Comment>"\n"		{		/* to break up long comments */
+		return_eol();
+	}
+
+<Comment>{EndComment}	{		/* end-of-comment */
+		comment_level--;
+		if (comment_level == 0) {
+			BEGIN INITIAL;
+		}
+	}
+
+\"{QuStrChar}*\"	{		/* quoted strings */
+		return_ch('"');
+	}
+
+\'{ApoStrChar}*\'	{		/* apostrophed strings */
+		return_ch('"');
+	}
+
+{Digit}+("B"|"C"|"H")?	{		/* numeral, passed as an identifier */
+		return_tk(IDF);
+	}
+
+"END"{Layout}+{Idf}	{		/* ignore identifier after END */
+		/*	At least one layout character is required between
+			END and the identifier.  With {Layout}* an identifier
+			that merely starts with END (ENDING, say) matches
+			this rule with the same length as the {Idf} rule
+			and, this rule coming first, is taken for END.
+		*/
+		return_tk(idf_in_list("END", reserved, sizeof reserved, SKIP));
+	}
+
+{Idf}/"("	{			/* identifier in front of ( */
+		register TOKEN tk;
+
+		tk = idf2token(option_set('F'));
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+{Idf}	{				/* identifier */
+		register TOKEN tk;
+
+		tk = idf2token(0 /* no hashing */);
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+"<>"	{				/* <>, special equivalence */
+		return_ch('#');
+	}
+
+\;	{				/* semicolon, conditionally ignored */
+		if (option_set('f')) return_ch(yytext[0]);
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		if (!skip_imports) return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	skip_imports = 1;
+	comment_level = 0;
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	return 1;
+}

+ 131 - 0
utils/sim_pasc/miralang.l

@@ -0,0 +1,131 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: miralang.l,v 1.3 2007/08/29 09:10:34 dick Exp $
+*/
+
+/*
+	Miranda language front end for the similarity tester.
+	Author:	Emma Norling ([email protected])
+	Date:	Nov 1998
+*/
+
+#include	"language.h"
+#include	"token.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+#include	"idf.h"
+
+/*	The reserved words of Miranda.
+	The table is searched by idf_in_list(), which bisects it using
+	strcmp(); the entries must therefore be in strcmp() order, in
+	which upper case letters come before lower case ones in ASCII.
+	With "False" and "True" sorted in among the lower case words
+	they would never be found.
+*/
+static const struct idf reserved[] = {
+	{"False",	NORM('F')},
+	{"True",	NORM('T')},
+	{"abstype",	NORM('a')},
+	{"bool",	NORM('b')},
+	{"char",	NORM('c')},
+	{"const",	META('c')},
+	{"div",		NORM('d')},
+	{"if",		NORM('i')},
+	{"mod",		NORM('m')},
+	{"num",		NORM('n')},
+	{"otherwise",	NORM('o')},
+	{"readvals",	NORM('r')},
+	{"show",	NORM('s')},
+	{"sys_message",	META('s')},
+	{"type",	NORM('t')},
+	{"where",	NORM('w')},
+	{"with",	META('w')}
+};
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	NORM('('),
+	NORM('['),
+	NORM('='),
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('('),
+	NORM('['),
+	NORM('='),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+AnyQuoted	(\\.)
+StrChar		([^"\n\\]|{AnyQuoted})
+ChrChar		([^'\\]|{AnyQuoted})
+
+Idf		([A-Za-z][A-Za-z0-9_']*)
+
+%%
+
+"||".*$	{				/* comment */
+	}
+
+\"{StrChar}*\"	{			/* strings */
+		return_ch('"');
+	}
+
+\'{ChrChar}\'	{			/* characters */
+		return_ch('\'');
+	}
+
+\%{Layout}*include.*	{		/* skip %include line */
+	}
+
+\%{Layout}*insert.*	{		/* skip %insert line */
+	}
+
+{Idf}	{				/* identifier */
+		return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF));
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	return 1;
+}

+ 123 - 0
utils/sim_pasc/options.c

@@ -0,0 +1,123 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: options.c,v 1.3 2001/11/13 12:55:53 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<stdlib.h>
+
+#include	"options.h"
+
+static char options[128];
+
+static void bad_option(
+	const char *progname, const struct option *optlist, char *msg, int c
+);
+static int opt_value(const struct option *op, const char *arg, char *argv[]);
+
+static int do_arg(
+	const char *progname, const struct option *optlist,
+	const char *arg, char *argv[]
+);
+
+/*	Processes the leading option arguments in argv[0..argc-1]: those
+	that start with '-' and have at least one more character.
+	Each is handed to do_arg(), which reports how many arguments it
+	used up.  Returns the total number of arguments used up.
+*/
+int
+do_options(
+	const char *progname, const struct option *optlist,
+	int argc, char *argv[]
+) {
+	int skips = 0;
+
+	for (;;) {
+		int consumed;
+
+		if (argc <= 0) break;			/* no arguments left */
+		if (argv[0][0] != '-') break;		/* not an option */
+		if (argv[0][1] == '\0') break;		/* a lone "-" */
+
+		consumed = do_arg(progname, optlist, &argv[0][1], argv);
+		argc -= consumed;
+		argv += consumed;
+		skips += consumed;
+	}
+
+	return skips;
+}
+
+/*	Returns the number of times option ch was given on the command
+	line, as counted in options[], or 0 for a character that has no
+	slot in that table.
+*/
+int
+option_set(char ch) {
+	/*	Plain char may be signed; going through unsigned char
+		keeps a negative value from indexing in front of the table.
+	*/
+	register unsigned char index = (unsigned char)ch;
+
+	if (index >= sizeof options) return 0;
+	return options[index];
+}
+
+/*	Treats the option characters in arg, which is argv[0] without its
+	leading '-'.  Each character must occur in optlist and is counted
+	in options[]; an option whose op_indicator is not ' ' takes a value,
+	located by opt_value(), and ends the scan of this argument.
+	Unknown options and missing values are reported through
+	bad_option().
+	Returns the number of arguments used up: the count reported by
+	opt_value() if a value was picked up, otherwise 1.
+*/
+static int
+do_arg(
+	const char *progname, const struct option *optlist,
+	const char *arg, char *argv[]
+) {
+	int consumed = 0;
+
+	while (*arg && !consumed) {
+		/* treat argument character */
+		register char opc = *arg++;
+		register const struct option *op = optlist;
+
+		/* find it among the allowed options */
+		while (op->op_char && op->op_char != opc) {
+			op++;
+		}
+		if (!op->op_char) {
+			bad_option(progname, optlist,
+				"*option -%c unknown", opc
+			);
+			/*NOTREACHED*/
+		}
+
+		options[(int)opc]++;
+		if (op->op_indicator != ' ') {
+			/* the option takes a value */
+			consumed = opt_value(op, arg, argv);
+			if (consumed < 0) {
+				bad_option(progname, (struct option *)0,
+					" option -%c requires another argument",
+					op->op_char
+				);
+				/*NOTREACHED*/
+			}
+		}
+	}
+
+	return consumed ? consumed : 1;
+}
+
+/*	Locates the value of option op and stores a pointer to it through
+	op->op_stringp.  The value is the rest of the current argument
+	(arg) if that is not empty, otherwise the next argument, argv[1].
+	Returns the number of arguments used up, 1 or 2, or -1 if there is
+	no next argument.
+*/
+static int
+opt_value(const struct option *op, const char *arg, char *argv[]) {
+	if (*arg == '\0') {
+		/* argument follows option */
+		if (argv[1] == 0) return -1;
+		*op->op_stringp = argv[1];
+		return 2;
+	}
+
+	/* argument is continuation of option */
+	*op->op_stringp = arg;
+	return 1;
+}
+
+static void
+bad_option(
+	const char *progname, const struct option *optlist, char *msg, int c
+) {
+	fprintf(stderr, "%s: ", progname);
+	fprintf(stderr, &msg[1], c);
+	fprintf(stderr, "\n");
+
+	if (msg[0] != ' ') {
+		register const struct option *op;
+
+		fprintf(stderr, "Possible options are:\n");
+		for (op = optlist; op->op_char; op++) {
+			fprintf(stderr, "\t-%c%c\t%s\n",
+				op->op_char, op->op_indicator, op->op_text
+			);
+		}
+	}
+	exit(1);
+}

+ 20 - 0
utils/sim_pasc/options.h

@@ -0,0 +1,20 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: options.h,v 1.3 2001/11/13 12:55:53 dick Exp $
+*/
+
+/*	Setting and consulting command line options
+*/
+
+struct option {
+	char op_char;		/* char as in call */
+	char *op_text;		/* elucidating text */
+	char op_indicator;	/* type indicator, N = int, F = file name */
+	const char **op_stringp;/* string value to be picked up */
+};
+
+extern int option_set(char ch);
+extern int do_options(
+	const char *progname, const struct option *optlist,
+	int argc, char *argv[]
+);

+ 256 - 0
utils/sim_pasc/pascallang.l

@@ -0,0 +1,256 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pascallang.l,v 2.9 2007/08/29 09:10:35 dick Exp $
+*/
+
+/*
+	PASCAL language front end for the similarity tester.
+	Author:	Maarten van der Meulen <[email protected]>
+*/
+
+#include	"options.h"
+#include	"algollike.h"
+#include	"token.h"
+#include	"idf.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+
+/* Data for module idf */
+
+static const struct idf ppcmd[] = {
+	{"define",	META('d')},
+	{"else",	META('e')},
+	{"endif",	META('E')},
+	{"if",		META('i')},
+	{"ifdef",	META('I')},
+	{"ifndef",	META('x')},
+	{"include",	MTCT('I')},
+	{"line",	META('l')},
+	{"undef",	META('u')}
+};
+
+static const struct idf reserved[] = {
+	{"and",		NORM('&')},
+	{"array",	NORM('A')},
+	{"begin",	NORM('{')},
+	{"case",	NORM('c')},
+	{"const",	NORM('C')},
+	{"div",		NORM('/')},
+	{"do",		NORM('D')},
+	{"downto",	NORM('d')},
+	{"else",	NORM('e')},
+	{"end",		NORM('}')},
+	{"extern",	CTRL('E')},
+	{"file",	NORM('F')},
+	{"for",		NORM('f')},
+	{"function",	NORM('p')},	/* Equal to procedure */
+	{"goto",	NORM('g')},
+	{"if",		NORM('i')},
+	{"in",		NORM('I')},
+	{"label",	NORM('l')},
+	{"mod",		NORM('%')},
+	{"nil",		NORM('n')},
+	{"not",		NORM('!')},
+	{"of",		SKIP},
+	{"or",		NORM('|')},
+	{"packed",	NORM('P')},
+	{"procedure",	NORM('p')},
+	{"program",	SKIP},
+	{"record",	NORM('r')},
+	{"repeat",	NORM('R')},
+	{"set",		NORM('s')},
+	{"then",	SKIP},
+	{"to",		NORM('t')},
+	{"type",	NORM('T')},
+	{"until",	NORM('u')},
+	{"var",		NORM('v')},
+	{"while",	NORM('w')},
+	{"with",	NORM('W')}
+};
+
+/* Special treatment of identifiers */
+
+/*	Converts the upper case letters A-Z in str to lower case, in place;
+	Pascal does not distinguish between the two.
+*/
+static void
+lower_case(char *str) {
+	register char *p = str;
+
+	while (*p != '\0') {
+		if (*p >= 'A' && *p <= 'Z') {
+			*p = *p - 'A' + 'a';
+		}
+		p++;
+	}
+}
+
+/*	Translates the identifier in yytext, which is first converted to
+	lower case, to a token: the translation of the reserved word if it
+	is one, otherwise IDF, or, when hashing is set, a one-token hash
+	code of the identifier.
+*/
+static TOKEN
+idf2token(int hashing) {
+	register TOKEN tk;
+
+	lower_case(yytext);
+	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
+	if (!hashing || !TOKEN_EQ(tk, IDF)) {
+		return tk;
+	}
+	/* return a one-token hash code */
+	return idf_hashed(yytext);
+}
+
+/* Token sets for module algollike */
+const TOKEN NonFinals[] = {
+	IDF,		/* identifier */
+	NORM('{'),	/* also begin */
+	NORM('('),
+	NORM('['),
+	NORM('A'),	/* array */
+	NORM('c'),	/* case */
+	NORM('C'),	/* const */
+	NORM('/'),	/* div */
+	CTRL('E'),	/* extern */
+	NORM('F'),	/* file */
+	NORM('f'),	/* for */
+	NORM('g'),	/* goto */
+	NORM('i'),	/* if */
+	NORM('l'),	/* label */
+	NORM('P'),	/* packed */
+	NORM('p'),	/* procedure/function */
+	NORM('r'),	/* record */
+	NORM('R'),	/* repeat */
+	NORM('s'),	/* set */
+	NORM('T'),	/* type */
+	NORM('v'),	/* var */
+	NORM('w'),	/* while */
+	NORM('W'),	/* with */
+	NOTOKEN
+};
+const TOKEN NonInitials[] = {
+	NORM(')'),
+	NORM('}'),
+	NORM(';'),
+	NOTOKEN
+};
+const TOKEN Openers[] = {
+	NORM('{'),
+	NORM('('),
+	NORM('['),
+	NOTOKEN
+};
+const TOKEN Closers[] = {
+	NORM('}'),
+	NORM(')'),
+	NORM(']'),
+	NOTOKEN
+};
+
+%}
+
+%option nounput
+%option never-interactive
+
+%Start	Comment
+
+Layout		([ \t\r\f])
+ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
+
+AnyQuoted	(\\.)
+StrChar		([^'\n\\]|{AnyQuoted})
+
+StartComment	("{"|"(*")
+EndComment	("}"|"*)")
+SafeComChar	([^*}\n])
+UnsafeComChar	("*")
+
+Digit		([0-9])
+Idf		([A-Za-z][A-Za-z0-9_]*)
+
+%%
+
+{StartComment}	{			/* See clang.l */
+		BEGIN Comment;
+	}
+
+<Comment>{SafeComChar}+	{		/* safe comment chunk */
+	}
+
+<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
+	}
+
+<Comment>"\n"		{		/* to break up long comments */
+		return_eol();
+	}
+
+<Comment>{EndComment}	{		/* end-of-comment */
+		BEGIN INITIAL;
+	}
+
+\'{StrChar}*\'	{			/* character strings */
+		return_ch('"');
+	}
+
+^#{Layout}*include.*	{		/* ignore #include lines */
+	}
+
+^#{Layout}*{Idf}	{		/* a preprocessor line */
+		register char *idf = yytext+1;
+
+		/* skip layout in front of preprocessor identifier */
+		while (*idf == ' ' || *idf == '\t') {
+			idf++;
+		}
+		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
+	}
+
+{Digit}+	{			/* numeral, passed as an identifier */
+		return_tk(IDF);
+	}
+
+{Idf}/"("	{			/* identifier in front of ( */
+		register TOKEN tk;
+
+		tk = idf2token(option_set('F'));
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+{Idf}	{				/* identifier */
+		register TOKEN tk;
+
+		tk = idf2token(0 /* no hashing */);
+		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
+	}
+
+\;	{				/* semicolon, conditionally ignored */
+		if (option_set('f')) return_ch(yytext[0]);
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+{ASCII95}	{			/* copy other text */
+		return_ch(yytext[0]);
+	}
+
+.	{				/* count non-ASCII chars */
+		lex_non_ascii_cnt++;
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	return 1;
+}

+ 119 - 0
utils/sim_pasc/pass1.c

@@ -0,0 +1,119 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass1.c,v 2.8 2007/08/27 09:57:32 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<string.h>
+
+#include	"debug.par"
+#include	"sim.h"
+#include	"text.h"
+#include	"tokenarray.h"
+#include	"lex.h"
+#include	"error.h"
+#include	"pass1.h"
+
+#ifdef	DB_TEXT
+static void db_print_text(const struct text *);
+#endif
+
+static void print_count(unsigned int cnt, const char *);
+
+void
+Pass1(int argc, char *argv[]) {
+	/*	Reads the files named in argv[], stores their tokens through
+		StoreToken() and fills in Text[n] for each of them.
+		A file name "/" is the separator between the new and the
+		old files. One report line per file and a total are
+		written to OutputFile.
+	*/
+	register int n;
+
+	InitText(argc);
+	InitTokenArray();
+
+	/* assume all texts to be new */
+	NumberOfNewTexts = NumberOfTexts;
+
+	/* read the files */
+	for (n = 0; n < NumberOfTexts; n++) {
+		register char *fname = argv[n];
+		register struct text *txt = &Text[n];
+
+		fprintf(OutputFile, "File %s: ", fname);
+
+		/* an empty token range until the file has been read */
+		txt->tx_fname = fname;
+		txt->tx_pos = 0;
+		txt->tx_start =
+		txt->tx_limit = TextLength();
+		if (strcmp(fname, "/") == 0) {
+			fprintf(OutputFile, "separator\n");
+			/* everything before the separator is new */
+			NumberOfNewTexts = n;
+		}
+		else {
+			if (!OpenText(First, txt)) {
+				fprintf(OutputFile, ">>>> cannot open <<<< ");
+				/*	the file has still been opened
+					with a null file for uniformity
+				*/
+			}
+			while (NextTextTokenObtained(First)) {
+				/* end-of-line tokens are not stored */
+				if (!TOKEN_EQ(lex_token, EOL)) {
+					StoreToken();
+				}
+			}
+			CloseText(First, txt);
+			txt->tx_limit = TextLength();
+
+			/* report */
+			print_count(txt->tx_limit - txt->tx_start, "token");
+			if (lex_non_ascii_cnt) {
+				/*	the separator belongs to the report
+					line, so it goes to OutputFile like
+					the counts around it
+				*/
+				fprintf(OutputFile, ", ");
+				print_count(lex_non_ascii_cnt,
+					"non-ASCII character"
+				);
+			}
+			fprintf(OutputFile, "\n");
+#ifdef	DB_TEXT
+			db_print_text(txt);
+#endif	/* DB_TEXT */
+		}
+		fflush(OutputFile);
+	}
+
+	/* report total */
+	fprintf(OutputFile, "Total: ");
+	print_count(TextLength() - 1, "token");
+	fprintf(OutputFile, "\n\n");
+	fflush(OutputFile);
+}
+
+static void
+print_count(unsigned int cnt, const char *unit) {
+	/*	Writes "<cnt> <unit>" to OutputFile, with an -s appended to
+		the unit unless cnt is exactly 1.
+	*/
+	const char *suffix = "s";
+
+	if (cnt == 1) {
+		suffix = "";
+	}
+	fprintf(OutputFile, "%u %s%s", cnt, unit, suffix);
+}
+
+#ifdef	DB_TEXT
+
+static void
+db_print_text(const struct text *txt) {
+	/*	prints a text (in compressed form) on DebugFile,
+		with a line break every 32 tokens
+	*/
+	register unsigned int i;	/* unsigned, like tx_start/tx_limit */
+
+	fprintf(DebugFile, "\n\n**** DB_PRINT_TEXT ****\n");
+
+	fprintf(DebugFile, "File \"%s\", %u tokens, ",
+		txt->tx_fname, txt->tx_limit - txt->tx_start
+	);
+	fprintf(DebugFile, "txt->tx_start = %u, txt->tx_limit = %u\n",
+		txt->tx_start, txt->tx_limit
+	);
+
+	for (i = txt->tx_start; i < txt->tx_limit; i++) {
+		if ((i - txt->tx_start + 1) % 32 == 0) {
+			fprintf(DebugFile, "\n");
+		}
+		/* the tokens go to the same stream as the rest of the dump */
+		print_token(DebugFile, TokenArray[i]);
+	}
+	fprintf(DebugFile, "\n");
+}
+
+#endif	/* DB_TEXT */

+ 9 - 0
utils/sim_pasc/pass1.h

@@ -0,0 +1,9 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass1.h,v 1.3 2001/09/28 09:03:50 dick Exp $
+*/
+
+/*	Reads the input files; stores the tokens in TOKEN TokenArray[]
+	and the input file descriptions in struct text Text[].
+*/
+extern void Pass1(int argc, char *argv[]);

+ 154 - 0
utils/sim_pasc/pass2.c

@@ -0,0 +1,154 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass2.c,v 2.10 2004/08/05 09:49:46 dick Exp $
+*/
+
+#include	<stdio.h>
+
+#include	"debug.par"
+#include	"sim.h"
+#include	"text.h"
+#include	"lex.h"
+#include	"pass2.h"
+
+#ifdef	DB_POS
+static void db_print_pos_list(const char *, const struct position *);
+static void db_print_lex(const char *);
+#endif
+
+static void pass2_txt(struct text *txt);
+static int next_eol_obtained(void);
+
+void
+Pass2(void) {
+	/* handles the texts one by one, in the order of Text[] */
+	int n = 0;
+
+	while (n < NumberOfTexts) {
+		pass2_txt(&Text[n]);
+		n++;
+	}
+}
+
+/*	instantiate sort_pos_list(): sortlist.bdy is parameterized by the
+	SORT_* macros to sort a list of struct position, linked through
+	ps_next; SORT_BEFORE puts the smaller ps_tk_cnt first
+*/
+#define	SORT_STRUCT		position
+#define	SORT_NAME		sort_pos_list
+#define	SORT_BEFORE(p1,p2)	((p1)->ps_tk_cnt < (p2)->ps_tk_cnt)
+#define	SORT_NEXT		ps_next
+#include	"sortlist.bdy"
+
+static void
+pass2_txt(struct text *txt) {
+	/*	Fills in ps_nl_cnt for all positions in txt->tx_pos, by
+		sorting them on token count and rescanning the file while
+		following lex_tk_cnt and lex_nl_cnt.
+	*/
+	register struct position *pos;
+	register unsigned int old_nl_cnt;
+
+	if (!txt->tx_pos)	/* no need to scan the file */
+		return;
+
+	if (!OpenText(Second, txt)) {
+		/* reported only; the scan below is still performed */
+		fprintf(stderr, ">>>> File %s disappeared <<<<\n",
+			txt->tx_fname
+		);
+	}
+	/* sets lex_nl_cnt and lex_tk_cnt */
+
+#ifdef	DB_POS
+	db_print_pos_list("before sorting", txt->tx_pos);
+#endif	/* DB_POS */
+
+	sort_pos_list(&txt->tx_pos);
+
+#ifdef	DB_POS
+	db_print_pos_list("after sorting", txt->tx_pos);
+#endif	/* DB_POS */
+
+#ifdef	DB_NL_BUFF
+	db_print_nl_buff(txt->tx_nl_start, txt->tx_nl_limit);
+#endif	/* DB_NL_BUFF */
+
+	/* old_nl_cnt is the line count before the last end-of-line read */
+	old_nl_cnt = 1;
+	pos = txt->tx_pos;
+	while (pos) {
+		/* we scan the pos list and the file in parallel */
+
+		/* find the corresponding line */
+		while (pos->ps_tk_cnt >= lex_tk_cnt) {
+			/* pos does not refer to this line, try the next */
+
+			/* shift the administration */
+			old_nl_cnt = lex_nl_cnt;
+			/* and get the next eol position */
+			if (!next_eol_obtained()) {
+				/* ouch! not enough lines! */
+				fprintf(stderr, ">>>> File %s modified <<<<\n",
+					txt->tx_fname
+				);
+				/* leaves the inner loop only; pos is still
+				   filled in below from the counts at hand
+				*/
+				break;
+			}
+#ifdef	DB_POS
+			db_print_lex(txt->tx_fname);
+#endif	/* DB_POS */
+		}
+
+		/* fill in the pos */
+		switch (pos->ps_type) {
+		case 0:	/* first token of run */
+			pos->ps_nl_cnt = old_nl_cnt;
+			break;
+		case 1:	/* last token of run */
+			pos->ps_nl_cnt = lex_nl_cnt;
+			break;
+		}
+		/* and get the next pos */
+		pos = pos->ps_next;
+	}
+
+#ifdef	DB_POS
+	db_print_pos_list("after scanning", txt->tx_pos);
+#endif	/* DB_POS */
+
+	CloseText(Second, txt);
+}
+
+static int
+next_eol_obtained(void) {
+	/*	Reads tokens from the Second stream until an EOL token has
+		been read (result 1) or the input is exhausted (result 0).
+	*/
+	int found = 0;
+
+	while (!found && NextTextTokenObtained(Second)) {
+		found = (TOKEN_EQ(lex_token, EOL) ? 1 : 0);
+	}
+	return found;
+}
+
+#ifdef	DB_POS
+
+static void
+db_print_pos(const struct position *pos) {
+	/*	prints one position on DebugFile: its type, its token count
+		and its line number, if set
+	*/
+	const char *kind = " last";
+
+	if (pos->ps_type == 0) {
+		kind = "first";
+	}
+	fprintf(DebugFile, "pos type: %s; token count: %u",
+		kind, pos->ps_tk_cnt
+	);
+	fprintf(DebugFile, ", line#: ");
+	/* the all-ones value is treated as "no line number" */
+	if (pos->ps_nl_cnt != (unsigned int)-1) {
+		fprintf(DebugFile, "%u", pos->ps_nl_cnt);
+	}
+	else {
+		fprintf(DebugFile, "<NOT SET>");
+	}
+	fprintf(DebugFile, "\n");
+}
+
+static void
+db_print_pos_list(const char *msg, const struct position *pos) {
+	/* prints the list starting at pos, under a heading containing msg */
+	const struct position *p;
+
+	fprintf(DebugFile, "\n**** DB_PRINT_POS_LIST, %s ****\n", msg);
+
+	for (p = pos; p; p = p->ps_next) {
+		db_print_pos(p);
+	}
+	fprintf(DebugFile, "\n");
+}
+
+static void
+db_print_lex(const char *fn) {
+	/* prints the current token and line counts, labelled with fn */
+	fprintf(DebugFile, "%s: lex_tk_cnt = %u, lex_nl_cnt = %u\n",
+		fn, lex_tk_cnt, lex_nl_cnt);
+}
+
+#endif	/* DB_POS */

+ 9 - 0
utils/sim_pasc/pass2.h

@@ -0,0 +1,9 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass2.h,v 1.2 1998/01/21 14:27:58 dick Exp $
+*/
+
+/*	Determines for each position that is part of a run, at which
+	line number it starts and ends.
+*/
+extern void Pass2(void);

+ 356 - 0
utils/sim_pasc/pass3.c

@@ -0,0 +1,356 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass3.c,v 2.11 2005/02/20 17:03:03 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<string.h>
+#include	<malloc.h>
+
+#include	"system.par"
+#include	"debug.par"
+#include	"sim.h"
+#include	"runs.h"
+#include	"error.h"
+#include	"options.h"
+#include	"pass3.h"
+#include	"percentages.h"
+
+#ifdef	DB_RUN
+#include	"tokenarray.h"
+static void db_run(const struct run *);
+#endif
+
+static FILE *open_chunk(const struct chunk *);
+static void fill_line(FILE *, char []);
+static void clear_line(char []);
+static void show_runs(void);
+static void show_run(const struct run *);
+static void show_2C_line(const char [], const char []);
+static void show_1C_line(FILE *, const char *);
+static int prhead(const struct chunk *);
+static int prs(const char *);
+static int pru(unsigned int);
+static int unslen(unsigned int);
+
+static int maxline;			/* Actual maximum line length */
+static char *line0;			/* by malloc() */
+static char *line1;
+
+void
+Pass3(void) {
+	/* the runs are shown, unless -p asks for percentages */
+	if (!option_set('p')) {
+		show_runs();
+		return;
+	}
+	show_percentages();
+}
+
+static void
+show_runs(void) {
+	/*	Shows all runs in the order in which the iterator delivers
+		them, using two line buffers of maxline characters plus a
+		terminating null byte each.
+	*/
+	AisoIter iter;
+	struct run *run;
+
+	maxline = PageWidth / 2 - 2;
+	if (maxline < 0) {
+		/*	a page width below 4 would give a negative column
+			width, and with it a wrong buffer size and a null
+			byte written outside the buffer; use empty columns
+		*/
+		maxline = 0;
+	}
+	line0 = malloc((size_t)maxline + 1);
+	line1 = malloc((size_t)maxline + 1);
+	if (!line0 || !line1) fatal("out of memory");
+
+	OpenIter(&iter);
+	while (GetAisoItem(&iter, &run)) {
+#ifdef	DB_RUN
+		db_run(run);
+#endif	/* DB_RUN */
+		show_run(run);
+		fprintf(OutputFile, "\n");
+	}
+	CloseIter(&iter);
+
+	free(line0); line0 = 0;
+	free(line1); line1 = 0;
+}
+
+static void
+show_run(const struct run *run) {
+	/*	Prints the heading of the run and, unless -n is set, the
+		text of its two chunks: side by side by default, one after
+		the other in diff(1)-like format with -d.
+	*/
+	/* The animals came in two by two ... */
+	register const struct chunk *cnk0 = &run->rn_cn0;
+	register const struct chunk *cnk1 = &run->rn_cn1;
+	/* number of lines to be shown for each chunk */
+	register unsigned int nl_cnt0 =
+			cnk0->ch_last.ps_nl_cnt - cnk0->ch_first.ps_nl_cnt;
+	register unsigned int nl_cnt1 =
+			cnk1->ch_last.ps_nl_cnt - cnk1->ch_first.ps_nl_cnt;
+	FILE *f0;
+	FILE *f1;
+
+	/* display heading of chunk */
+	if (!option_set('d')) {
+		/* no assumptions about the lengths of the file names! */
+		register unsigned int size = run->rn_size;
+		register int pos = 0;	/* number of characters printed */
+
+		pos += prhead(cnk0);
+		/* pad the left heading to the column separator */
+		while (pos < maxline + 1) {
+			pos += prs(" ");
+		}
+		pos += prs("|");
+		pos += prhead(cnk1);
+		/* pad, leaving room for the digits of the size */
+		while (pos < 2*maxline - unslen(size)) {
+			pos += prs(" ");
+		}
+		fprintf(OutputFile, "[%u]\n", size);
+	}
+	else {
+		(void)prhead(cnk0);
+		fprintf(OutputFile, "\n");
+		(void)prhead(cnk1);
+		fprintf(OutputFile, "\n");
+	}
+
+	/* stop if that suffices */
+	if (option_set('n'))
+		return;			/* ... had enough so soon ... */
+
+	/* open the files that hold the chunks */
+	f0 = open_chunk(cnk0);
+	f1 = open_chunk(cnk1);
+
+	/* display the chunks in the required format */
+	if (!option_set('d')) {
+		/* fill 2-column lines and print them */
+		while (nl_cnt0 != 0 || nl_cnt1 != 0) {
+			/* the shorter chunk is continued with empty lines */
+			if (nl_cnt0) {
+				fill_line(f0, line0);
+				nl_cnt0--;
+			}
+			else {
+				clear_line(line0);
+			}
+			if (nl_cnt1) {
+				fill_line(f1, line1);
+				nl_cnt1--;
+			}
+			else {
+				clear_line(line1);
+			}
+			show_2C_line(line0, line1);
+		}
+	}
+	else {
+		/* display the lines in a diff(1)-like format */
+		while (nl_cnt0--) {
+			show_1C_line(f0, "<");
+		}
+		fprintf(OutputFile, "---\n");
+		while (nl_cnt1--) {
+			show_1C_line(f1, ">");
+		}
+	}
+
+	/* close the pertinent files */
+	fclose(f0);
+	fclose(f1);
+}
+
+static int
+prhead(const struct chunk *cnk) {
+	/*	Prints "<file name>: line <first>-<last>" for the chunk and
+		returns the number of characters printed.
+	*/
+	const unsigned int first_line = cnk->ch_first.ps_nl_cnt;
+	const unsigned int last_line = cnk->ch_last.ps_nl_cnt - 1;
+	int width;
+
+	width = prs(cnk->ch_text->tx_fname);
+	width += prs(": line ");
+	width += pru(first_line);
+	width += prs("-");
+	width += pru(last_line);
+	return width;
+}
+
+static int
+prs(const char *str) {
+	/* prints the string and returns its length */
+	int width = strlen(str);
+
+	fputs(str, OutputFile);
+	return width;
+}
+
+static int
+pru(unsigned int u) {
+	/* prints the number in decimal and returns its number of digits */
+	int width = unslen(u);
+
+	fprintf(OutputFile, "%u", u);
+	return width;
+}
+
+static int
+unslen(unsigned int u) {
+	/* the number of decimal digits of u; 1 for 0 */
+	int digits;
+
+	for (digits = 1; u >= 10; u /= 10) {
+		digits++;
+	}
+	return digits;
+}
+
+static FILE *
+open_chunk(const struct chunk *cnk) {
+	/*	opens the file in which the chunk resides, positions the
+		file at the beginning of the chunk and returns the file pointer.
+		If the file cannot be opened, this is reported and NULLFILE
+		is opened in its place; the result is never a null pointer.
+	*/
+	register char *fname = cnk->ch_text->tx_fname;
+	register FILE *f = fopen(fname, "r");
+	register unsigned int nl_cnt;
+
+	if (!f) {
+		fprintf(stderr, ">>>> File %s disappeared <<<<\n", fname);
+		f = fopen(NULLFILE, "r");
+		if (!f) {
+			/*	without a stream the getc() below and the
+				fclose() in the caller would be undefined
+			*/
+			fatal("cannot open null file");
+		}
+	}
+
+	/* skip the lines in front of the first line of the chunk */
+	nl_cnt = cnk->ch_first.ps_nl_cnt;
+	while (nl_cnt > 1) {
+		int ch = getc(f);
+
+		if (ch == EOF) break;	/* file is shorter than expected */
+		if (ch == '\n') {
+			nl_cnt--;
+		}
+	}
+
+	return f;
+}
+
+static void
+fill_line(FILE *f, char ln[]) {
+	/*	Reads one line from f into ln, up to and including its
+		newline. Leading white space is condensed, tabs in the rest
+		become blanks, and what does not fit in maxline characters
+		is dropped.
+	*/
+	int blanks = 0;		/* leading blanks not yet accounted for */
+	int used = 0;		/* characters stored in ln[] */
+	int c = getc(f);
+
+	/* leading white space: a tab, or eight blanks, yield one blank */
+	for (; c == ' ' || c == '\t'; c = getc(f)) {
+		blanks = (c == '\t' ? 8 : blanks + 1);
+		if (blanks != 8) continue;
+		if (used < maxline) {
+			ln[used++] = ' ';
+		}
+		blanks = 0;
+	}
+
+	/* the remainder of the line */
+	for (; c != '\n' && c >= 0; c = getc(f)) {
+		if (used < maxline) {
+			ln[used++] = (c == '\t' ? ' ' : c);
+		}
+	}
+	ln[used] = '\0';		/* the buffer has maxline + 1 bytes */
+}
+
+static void
+clear_line(char ln[]) {
+	/* makes ln the empty string */
+	*ln = '\0';
+}
+
+static void
+show_2C_line(const char ln0[], const char ln1[]) {
+	/*	Prints ln0, padded with blanks to maxline characters, then
+		" |", then ln1 and a newline; at most maxline characters of
+		either line are printed.
+	*/
+	int col = 0;
+
+	/* left column, text and padding */
+	while (col < maxline && ln0[col] != '\0') {
+		fputc(ln0[col], OutputFile);
+		col++;
+	}
+	while (col < maxline) {
+		fputc(' ', OutputFile);
+		col++;
+	}
+	fputs(" |", OutputFile);
+
+	/* right column, no padding needed */
+	col = 0;
+	while (col < maxline && ln1[col] != '\0') {
+		fputc(ln1[col], OutputFile);
+		col++;
+	}
+	fputc('\n', OutputFile);
+}
+
+static void
+show_1C_line(FILE *f, const char *marker) {
+	/*	displays one line from f, preceded by the marker; the line
+		is consumed up to and including its newline
+	*/
+	register int ch;
+
+	fprintf(OutputFile, "%s", marker);
+	/*	Stop at end of file only, not at a null byte: stopping on
+		a null byte would leave the rest of the line in f, to be
+		shown as the next line, so that the caller's line count
+		no longer fits the text.
+	*/
+	while ((ch = getc(f)), ch != EOF && ch != '\n') {
+		fputc(ch, OutputFile);
+	}
+	fputc('\n', OutputFile);
+}
+
+#ifdef	DB_RUN
+
+static void db_chunk(const struct chunk *);
+
+static void
+db_run(const struct run *run) {
+	/*	dumps the data of a run on DebugFile: file names, token and
+		line positions, size, and the tokens of both chunks
+	*/
+	const struct chunk *left = &run->rn_cn0;
+	const struct chunk *right = &run->rn_cn1;
+	const char *unit = "tokens";
+
+	if (run->rn_size == 1) {
+		unit = "token";
+	}
+
+	fprintf(DebugFile, "File %s / file %s:\n",
+		left->ch_text->tx_fname, right->ch_text->tx_fname
+	);
+	fprintf(DebugFile, "from token %u/%u to %u/%u:",
+		left->ch_first.ps_tk_cnt, right->ch_first.ps_tk_cnt,
+		left->ch_last.ps_tk_cnt, right->ch_last.ps_tk_cnt
+	);
+	fprintf(DebugFile, " from lines %u/%u to %u/%u:",
+		left->ch_first.ps_nl_cnt, right->ch_first.ps_nl_cnt,
+		left->ch_last.ps_nl_cnt, right->ch_last.ps_nl_cnt
+	);
+	fprintf(DebugFile, " %u %s\n", run->rn_size, unit);
+
+	db_chunk(left);
+	db_chunk(right);
+}
+
+static void
+db_chunk(const struct chunk *cnk) {
+	/*	print the tokens in the chunk on DebugFile, with a one-token
+		margin on either side, where such a token exists
+	*/
+	unsigned int i;
+	const struct position *first = &cnk->ch_first;
+	const struct position *last = &cnk->ch_last;
+	unsigned int start = cnk->ch_text->tx_start;
+
+	/*	all tokens go to DebugFile, the stream that also receives
+		the text printed around them
+	*/
+	if (first->ps_tk_cnt > 0) {
+		/* there is a token in front of the chunk */
+		fprintf(DebugFile, "...");
+		print_token(DebugFile,
+			TokenArray[start + first->ps_tk_cnt - 1]
+		);
+		fprintf(DebugFile, "  ");
+	}
+	else {	/* create same offset as above */
+		fprintf(DebugFile, "       ");
+	}
+
+	for (i = first->ps_tk_cnt; i <= last->ps_tk_cnt; i++) {
+		print_token(DebugFile, TokenArray[start + i]);
+	}
+
+	if (start + last->ps_tk_cnt + 1 < cnk->ch_text->tx_limit) {
+		/* there is a token of this text behind the chunk */
+		fprintf(DebugFile, "  ");
+		print_token(DebugFile,
+			TokenArray[start + last->ps_tk_cnt + 1]
+		);
+		fprintf(DebugFile, "...");
+	}
+
+	fprintf(DebugFile, "\n");
+}
+
+#endif	/* DB_RUN */

+ 7 - 0
utils/sim_pasc/pass3.h

@@ -0,0 +1,7 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: pass3.h,v 1.2 1998/01/21 14:28:01 dick Exp $
+*/
+
+/*	Print the contents of runs */
+extern void Pass3(void);

+ 115 - 0
utils/sim_pasc/percentages.c

@@ -0,0 +1,115 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: percentages.c,v 1.3 2007/08/27 09:57:33 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<malloc.h>
+
+#include	"sim.h"
+#include	"runs.h"
+#include	"error.h"
+#include	"percentages.h"
+
+struct match {
+	/* the summed size of the runs found for one pair of texts */
+	struct match *ma_next;		/* next entry in the match list */
+	struct text *ma_text0;		/* text of chunk 0 of the runs */
+	struct text *ma_text1;		/* text of chunk 1 of the runs */
+	unsigned int ma_size;		/* sum of the rn_size of the runs */
+};
+
+static struct match *match_start;	/* to be allocated by malloc */
+
+int
+add_to_percentages(struct run *r) {
+	/*	Adds the size of the run to the match list entry for its pair
+		of texts, making a new entry at the end when there is none.
+		Returns 0 if out of memory, in line with add_to_run(),
+		and 1 otherwise.
+	*/
+	struct text *text0 = r->rn_cn0.ch_text;
+	struct text *text1 = r->rn_cn1.ch_text;
+	struct match **hook;
+	struct match *fresh;
+
+	/* percentages are only meaningful between different files */
+	if (text0 == text1) return 1;
+
+	/* update the entry for (text0, text1), if it is there */
+	for (hook = &match_start; *hook; hook = &(*hook)->ma_next) {
+		struct match *m = *hook;
+
+		if (m->ma_text0 != text0 || m->ma_text1 != text1) continue;
+		m->ma_size += r->rn_size;
+		return 1;
+	}
+
+	/* it is not; hook now designates the end of the list */
+	fresh = (struct match *)malloc(sizeof (struct match));
+	if (fresh == 0) return 0;
+	fresh->ma_next = 0;
+	fresh->ma_text0 = text0;
+	fresh->ma_text1 = text1;
+	fresh->ma_size = r->rn_size;
+	*hook = fresh;
+	return 1;
+}
+
+static void
+add_reverse_entries_to_match_list(void) {
+	/*	Gives each entry (text0, text1) a twin (text1, text0) of the
+		same size, placed directly behind it in the list.
+	*/
+	struct match *m = match_start;
+
+	while (m) {
+		struct match *twin =
+			(struct match *)malloc(sizeof (struct match));
+
+		if (!twin) fatal("out of memory");
+		twin->ma_text0 = m->ma_text1;
+		twin->ma_text1 = m->ma_text0;
+		twin->ma_size = m->ma_size;
+
+		/* link the twin in behind the original */
+		twin->ma_next = m->ma_next;
+		m->ma_next = twin;
+
+		/* continue with the next original entry */
+		m = twin->ma_next;
+	}
+}
+
+static float
+match_percentage(struct match *m) {
+	/*	The matched size divided by the number of tokens of text0;
+		a fraction, not yet multiplied by 100.
+	*/
+	const struct text *subject = m->ma_text0;
+	int n_tokens = subject->tx_limit - subject->tx_start;
+	double fraction = m->ma_size*1.0;
+
+	fraction /= n_tokens;
+	return fraction;
+}
+
+/*	instantiate sort_match_list(): sortlist.bdy is parameterized by the
+	SORT_* macros to sort a list of struct match, linked through
+	ma_next; SORT_BEFORE puts the larger match_percentage() first
+*/
+#define	SORT_STRUCT		match
+#define	SORT_NAME		sort_match_list
+#define	SORT_BEFORE(p1,p2)	(match_percentage(p1) > match_percentage(p2))
+#define	SORT_NEXT		ma_next
+#include	"sortlist.bdy"
+
+static void
+print_percentages(void) {
+	/* prints one line for each entry of the match list, in list order */
+	struct match *entry;
+
+	for (entry = match_start; entry; entry = entry->ma_next) {
+		/* the percentage is truncated, not rounded */
+		int pct = (int)(match_percentage(entry)*100.0);
+
+		fprintf(OutputFile,
+			"%s consists for %d %% of %s material\n",
+			entry->ma_text0->tx_fname,
+			pct,
+			entry->ma_text1->tx_fname
+		);
+	}
+}
+
+void
+show_percentages(void) {
+	/*	The reverse entries are added first, so that they take part
+		in the sorting and the printing.
+	*/
+	add_reverse_entries_to_match_list();
+	sort_match_list(&match_start);
+	print_percentages();
+}

+ 7 - 0
utils/sim_pasc/percentages.h

@@ -0,0 +1,7 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: percentages.h,v 1.2 2004/08/05 09:49:48 dick Exp $
+*/
+
+extern int add_to_percentages(struct run *r);
+extern void show_percentages(void);

+ 11 - 0
utils/sim_pasc/runs.c

@@ -0,0 +1,11 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: runs.c,v 1.2 2001/11/08 12:30:30 dick Exp $
+*/
+
+#include	"sim.h"
+#include	"runs.h"
+
+/*	Ordering for the generic code in aiso.bdy: a run with a larger
+	rn_size comes before one with a smaller rn_size.
+*/
+#define	AISO_BEFORE(r0,r1)	((r0)->rn_size > (r1)->rn_size)
+
+#include	"aiso.bdy"

+ 33 - 0
utils/sim_pasc/runs.h

@@ -0,0 +1,33 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: runs.h,v 1.2 2001/11/08 12:30:30 dick Exp $
+*/
+
+/*	Although all other segments of data in this program are described by
+	giving the position of the first in the segment and that of the
+	first not in the segment (so the size is the difference of the two),
+	a `chunk' is given by first and last. This is done because later on we
+	are interested in the actual position of the last token of it, and
+	the position of the first token not in the segment gives no
+	indication about that.
+*/
+
+struct chunk {
+	/* a chunk of text in various representations */
+	struct text *ch_text;		/* pointer to the file */
+	struct position ch_first;	/* first in chunk */
+	struct position ch_last;	/* last in chunk; inclusive, not
+					   the first position behind it
+					*/
+};
+
+struct run {				/* a 'run' of coincident tokens */
+	struct chunk rn_cn0;		/* chunk in left file */
+	struct chunk rn_cn1;		/* chunk in right file */
+	unsigned int rn_size;		/* printed as a token count */
+};
+
+/*	Parameters for the generic code in aiso.spc: the items are pointers
+	to struct run. AISO_ITERATOR presumably requests the iterator
+	interface -- TODO confirm against aiso.spc.
+*/
+#define	AISO_TYPE	struct run *
+#define	AISO_ITERATOR
+
+/* adding a run is an insertion through InsertAiso() */
+#define	add_to_runs(r)	InsertAiso(r)
+
+#include	"aiso.spc"

+ 8 - 0
utils/sim_pasc/settings.par

@@ -0,0 +1,8 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: settings.par,v 1.1 1997/06/20 12:03:22 dick Exp $
+*/
+
+#define	DFLT_MIN_RUN_SIZE	24	/* default minimum run size */
+
+#define	DFLT_PAGE_WIDTH		80	/* default page width */

+ 176 - 0
utils/sim_pasc/sim.1

@@ -0,0 +1,176 @@
+.\"	This file is part of the software similarity tester SIM.
+.\"	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+.\"	$Id: sim.1,v 2.6 2004/08/05 09:49:49 dick Exp $
+.\"
+.TH SIM 1 2001/11/13 "Vrije Universiteit"
+.SH NAME
+sim \- find similarities in C, Java, Pascal, Modula-2, Lisp, Miranda or text files
+.SH SYNOPSIS
+.B sim_c
+[
+.B \-[defFnpsS]
+.B \-r
+.I N
+.B \-w
+.I N
+.B \-o
+.I F
+]
+file ... [
+.B /
+[ file ... ] ]
+.br
+.B sim_c
+\&...
+.br
+.B sim_java
+\&...
+.br
+.B sim_pasc
+\&...
+.br
+.B sim_m2
+\&...
+.br
+.B sim_lisp
+\&...
+.br
+.B sim_mira
+\&...
+.br
+.B sim_text
+\&...
+.br
+.SH DESCRIPTION
+.I Sim_c
+reads the C files
+.I file ...
+and looks for pieces of text that are similar; two pieces of program text
+are similar if they only differ in layout, comment, identifiers and
+the contents of numbers, strings and characters.
+If any runs of sufficient length
+are found, they are reported on standard output; the number of significant
+tokens in the run is given between square brackets.
+.PP
+.I Sim_java
+does the same for Java,
+.I sim_pasc
+for Pascal,
+.I sim_m2
+for Modula-2,
+.I sim_lisp
+for Lisp, and
+.I sim_mira
+for Miranda.
+.I Sim_text
+works on arbitrary text; it is occasionally useful on shell scripts.
+.PP
+The program can be used for finding copied pieces of code in
+purportedly unrelated programs (with
+.B \-s
+or
+.BR \-S ),
+or for finding accidentally duplicated code in larger projects (with
+.BR \-f ).
+.PP
+If a
+.B /
+is present between the input files, the latter are divided into a group of
+"new" files (before the
+.BR / )
+and a group of "old" files; if there is no
+.BR / ,
+all files are "new".
+Old files are never compared to each other.
+Since the similarity tester
+reads the files several times, it cannot read from standard input.
+.PP
+There are the following options:
+.TP
+.B \-d
+The output is in a diff(1)-like format instead of the default
+2-column format.
+.TP
+.B \-e
+Each file is compared to each file in isolation; this will find all
+similarities between all texts involved, regardless of duplicates.
+.TP
+.B \-f
+Runs are restricted to pieces with balancing parentheses, to isolate
+potential functions (C, Java, Pascal, Modula-2 and Lisp only).
+.TP
+.B \-F
+The names of functions in calls are required to match exactly
+(C, Java, Pascal, Modula-2 and Lisp only).
+.TP
+.B \-n
+Similarities found are only summarized, not displayed.
+.TP
+.B "\-o F"
+The output is written to the file named
+.I F.
+.TP
+.B \-p
+The output is given in similarity percentages; see below.
+.TP
+.B "\-r N"
+The minimum run length is set to
+.I N
+(default is
+.I N
+= 24).
+.TP
+.B \-s
+The contents of a file are not compared to itself (\-s = not self).
+.TP
+.B \-S
+The contents of the new files are compared to the old files only \- not
+between themselves.
+.TP
+.B "\-w N"
+The page width used is set to
+.I N
+columns (default is
+.I N
+= 80).
+.PP
+The
+.B \-p
+option results in lines of the form
+.DS
+.ft 5
+F consists for x % of G material
+.ft P
+.DE
+meaning that \f5x\fP % of \f5F\fP's text can also be found in \f5G\fP.
+Note that this relation is not symmetric; it is in fact quite possible for one
+file to consist for 100 % of text from another file, while the other file
+consists for only 1 % of text of the first file, if their lengths differ
+enough.
+Note also that the granularity of the recognized text is still governed by the
+.B \-r
+option or its default.
+.PP
+Care has been taken to keep all internal processes linear in the length of the
+input, with the exception of the matching process which is almost linear,
+using a hash table; various other tables are used for speed-up.
+If, however, there is not enough memory for the tables, they are discarded in
+order of unimportance, under which conditions the algorithms revert to their
+quadratic nature.
+.SH AUTHOR
+Dick Grune, Vrije Universiteit, Amsterdam.
+.SH BUGS
+Strong periodicity in the input text (like a table of
+.I N
+almost identical lines) causes problems.
+.I Sim
+tries to cope with this but cannot avoid giving appr.\&
+.I log N
+messages about it.
+The best advice is still to take the offending files out of the game.
+.PP
+Since it uses
+.I lex(1)
+on some systems, it may dump core on any weird construction that overflows
+.IR lex 's
+internal buffers.

+ 149 - 0
utils/sim_pasc/sim.c

@@ -0,0 +1,149 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: sim.c,v 2.12 2007/08/27 09:57:34 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<stdlib.h>
+
+#include	"settings.par"
+#include	"sim.h"
+#include	"options.h"
+#include	"language.h"
+#include	"error.h"
+#include	"hash.h"
+#include	"compare.h"
+#include	"pass1.h"
+#include	"pass2.h"
+#include	"pass3.h"
+#include	"stream.h"
+#include	"lex.h"
+
+unsigned int MinRunSize = DFLT_MIN_RUN_SIZE;	/* reset by -r N */
+int PageWidth = DFLT_PAGE_WIDTH;		/* reset by -w N */
+FILE *OutputFile;			/* stdout, or the file of -o F */
+FILE *DebugFile;			/* set to stdout by main() */
+
+struct text *Text;			/* to be filled in by malloc */
+int NumberOfTexts;			/* number of text records */
+int NumberOfNewTexts;			/* number of new text records */
+
+char *progname;				/* for error reporting */
+
+/* values of the options -o, -r and -w; null if the option is absent */
+static const char *outputname;		/* for reporting */
+static const char *minrunstring;
+static const char *pagewidthstring;
+
+/*	The option table handed to do_options(); it ends in an all-zero
+	entry. Judging by the entries, each holds the option character,
+	a description, a character that is 'N' or 'F' for an option with a
+	value and ' ' for one without, and the address of the string that
+	receives the value (0 if there is none).
+	NOTE(review): confirm the field meanings against options.h.
+*/
+static const struct option optlist[] = {
+	{'r', "minimum run size", 'N', &minrunstring},
+	{'w', "page width", 'N', &pagewidthstring},
+	{'f', "function-like forms only", ' ', 0},
+	{'d', "use diff format for output", ' ', 0},
+	{'p', "use percentage format for output", ' ', 0},
+	{'e', "compare each file to each file separately", ' ', 0},
+	{'s', "do not compare a file to itself", ' ', 0},
+	{'S', "compare new files to old files only", ' ', 0},
+	{'F', "keep function identifiers in tact", ' ', 0},
+	{'n', "display headings only", ' ', 0},
+	{'x', "no pass2 nl_buff allocation", ' ', 0},
+	{'o', "write output to file F", 'F', &outputname},
+	{'-', "lexical scan output only", ' ', 0},
+	{0, 0, 0, 0}
+};
+
+static void print_stream(const char *fname);
+
+int
+main(int argc, char *argv[]) {
+	/*	Handles the options and then either shows the token stream
+		of each file (option --) or runs the passes over the files.
+		Returns 0; errors in the options end the program through
+		fatal().
+	*/
+	progname = argv[0];		/* save program name */
+	argv++, argc--;			/* and skip it */
+
+	/* Set the default output and debug streams */
+	OutputFile = stdout;
+	DebugFile = stdout;
+
+	/* Get command line options */
+	{	int nop = do_options(progname, optlist, argc, argv);
+		argc -= nop, argv += nop;	/* skip them */
+	}
+
+	/* Treat the value options */
+	if (minrunstring) {
+		MinRunSize = strtoul(minrunstring, NULL, 10);
+		if (MinRunSize == 0) fatal("bad or zero run size; form is: -r N");
+	}
+	if (pagewidthstring) {
+		PageWidth = atoi(pagewidthstring);
+		/* a negative width is as useless as a zero one */
+		if (PageWidth <= 0) fatal("bad or zero page width; form is: -w N");
+	}
+	if (outputname) {
+		OutputFile = fopen(outputname, "w");
+		if (OutputFile == 0) {
+			char msg[500];
+
+			/*	the file name comes from the command line and
+				may be longer than msg[]; snprintf() cuts
+				the message off rather than overflow it
+			*/
+			snprintf(msg, sizeof msg,
+				"cannot open output file %s", outputname
+			);
+			fatal(msg);
+			/*NOTREACHED*/
+		}
+	}
+
+	if (option_set('-')) {
+		/* it is the lexical scan only */
+		while (argv[0]) {
+			print_stream(argv[0]);
+			argv++;
+		}
+		return 0;
+	}
+
+	/* Start processing */
+	InitLanguage();
+
+	/* Read the input files */
+	Pass1(argc, argv);
+
+	/* Set up the forward reference table */
+	MakeForwardReferences();
+
+	/* Compare the input files to find runs */
+	Compare();
+
+	/* Delete forward reference table */
+	FreeForwardReferences();
+
+	/* Find positions of the runs found */
+	Pass2();
+
+	/* Print the similarities */
+	Pass3();
+
+	return 0;
+}
+
+static void
+print_stream(const char *fname) {
+	/*	Shows the token stream of the file on OutputFile; each line
+		of tokens is preceded by the line and token counts.
+	*/
+	fprintf(OutputFile, "File %s:", fname);
+	if (!OpenStream(fname)) {
+		fprintf(OutputFile, " cannot open\n");
+		return;
+	}
+
+	fprintf(OutputFile, " showing token stream:\nnl_cnt, tk_cnt: tokens");
+
+	/* starting off with an EOL gives the first line its counts too */
+	for (lex_token = EOL; ; ) {
+		if (!TOKEN_EQ(lex_token, EOL)) {
+			print_token(OutputFile, lex_token);
+		}
+		else {
+			fprintf(OutputFile, "\n%u,%u:",
+				lex_nl_cnt, lex_tk_cnt
+			);
+		}
+		if (!NextStreamTokenObtained()) break;
+	}
+
+	fprintf(OutputFile, "\n");
+
+	CloseStream();
+}

+ 39 - 0
utils/sim_pasc/sim.h

@@ -0,0 +1,39 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: sim.h,v 2.7 2005/02/20 17:03:03 dick Exp $
+*/
+
+#include	<stdio.h>
+
+struct position {
+	/* position of first and last token of a chunk */
+	struct position *ps_next;	/* next in the list of a text */
+	int ps_type;		/* first = 0, last = 1 */
+	unsigned int ps_tk_cnt;	/* in tokens; set by add_run() in Pass 1 */
+	unsigned int ps_nl_cnt;	/* same, in line numbers; set by Pass2(),
+				   used by Pass3() to report line numbers
+				*/
+};
+
+struct text {
+	/* the description of one input file */
+	char *tx_fname;		/* the file name */
+	struct position *tx_pos;/* list of positions in this file that are
+				   part of a chunk; sorted and updated by
+				   Pass 2
+				*/
+	unsigned int tx_start;	/* positions in TokenArray[] for the text */
+	unsigned int tx_limit;	/* tx_limit - tx_start is its token count */
+	unsigned int tx_nl_start;/* possibly newline pointer for pass2 */
+	unsigned int tx_nl_limit;
+};
+
+extern unsigned int MinRunSize;
+extern int PageWidth;
+extern FILE *OutputFile;
+extern FILE *DebugFile;
+
+extern struct text *Text;		/* Text[], one for each input file */
+extern int NumberOfTexts;		/* number of text records */
+extern int NumberOfNewTexts;		/* number of new text records */
+
+extern char *progname;			/* for error reporting */

+ 116 - 0
utils/sim_pasc/sim.html

@@ -0,0 +1,116 @@
+<HTML>
+<!-- $Id: sim.html,v 1.7 2007/08/27 09:57:35 dick Exp $ -->
+<HEAD>
+<TITLE>The software and text similarity tester SIM</TITLE>
+</HEAD>
+
+<BODY>
+<H1>The software and text similarity tester SIM</H1>
+
+<H2>
+<A HREF="http://www.cs.vu.nl/~dick/">Dick Grune</A>
+</H2>
+
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/README.1st">SIM</A>
+tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp, Miranda,
+and natural language.
+It is used
+<UL>
+
+<LI>
+to detect potentially duplicated code fragments in large software
+projects, in program text, in shell scripts and in documentation
+</LI>
+
+<LI>
+to detect plagiarism in software projects, educational and otherwise
+</LI>
+
+</UL>
+
+<P>
+SIM 2.19 is available as
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.shar">
+C sources</A>
+and as
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim_2_19.zip">
+MSDOS binaries</A>.
+It is also available through ftp; the directory is
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester">
+ftp.cs.vu.nl:/pub/dick/similarity_tester</A>.
+There is a
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/sim.pdf">
+Unix-style manual page</A>.
+</P>
+
+<P>
+The software similarity tester is very efficient and allows us to compare
+this year's students' work with that collected from many past years (much to
+the dismay of some, mostly non-CS, students).
+Students are told that their work is going to be compared, but some are
+non-believers ...
+</P>
+
+<P>
+The output of the similarity tester can be processed by a number of shell
+scripts by Matty Huntjens
+(<A HREF="http://www.cs.vu.nl/~matty/">[email protected]</A>).
+These shell scripts take sim output and produce lists of suspect submissions,
+histograms and the like.
+The present version of these scripts is very much geared to the local
+situation at the
+<A HREF="http://www.vu.nl/">VU University Amsterdam</A>,
+though; they are low on portability.
+</P>
+
+<P>
+We are not afraid that students would try to tune their work to the
+similarity tester.
+We reckon if they can do that they can also do the exercise.
+</P>
+
+<P>
+Since this piece of handicraft does not qualify as research, there are no
+international papers on it.
+The work was described in Dutch in
+Dick Grune,
+Matty Huntjens,
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/publications/Het_detecteren_van_kopieen_bij_informatica-practica.ps">
+Het detecteren van kopie&euml;n bij informatica-practica</A>,
+Informatie,
+<STRONG>31</STRONG>,
+11,
+Nov 1989,
+pp. 864-867
+(<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/artikel.lit">
+lit. ref.</A>).
+An
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/Paper.ps">
+English translation
+</A>
+of the paper is also available.
+The ftp directory contains a terse
+<A HREF="ftp://ftp.cs.vu.nl/pub/dick/similarity_tester/TechnReport">
+technical report</A>
+about the internal workings of the program.
+</P>
+
+<H5>
+<HR>
+[<A HREF="CVS.html">Previous</A>]
+[<A HREF="mag.html">Next</A>]
+[<A HREF="http://www.cs.vu.nl/~dick/dick.html">Personal Page</A>]
+[<A HREF="http://www.cs.vu.nl/~dick/">Professional Page</A>]
+[<A HREF="http://www.cs.vu.nl/">CS</A>]
+[<A HREF="http://www.few.vu.nl/">Faculty</A>]
+[<A HREF="http://www.vu.nl/">VU University Amsterdam</A>]
+<HR>
+</H5>
+
+<ADDRESS>
+The software and text similarity tester SIM / Dick Grune /
+<A HREF="mailto:[email protected]">[email protected]</A>
+</ADDRESS>
+
+</BODY>
+</HTML>

+ 198 - 0
utils/sim_pasc/sim.txt

@@ -0,0 +1,198 @@
+
+
+
+User Commands                                              SIM(1)
+
+
+
+NAME
+     sim - find similarities in C, Java, Pascal, Modula-2,  Lisp,
+     Miranda or text files
+
+SYNOPSIS
+     sim_c [ -[defFnpsS] -r N -w N -o F ] file ... [ / [ file ...
+     ] ]
+     sim_c ...
+     sim_java ...
+     sim_pasc ...
+     sim_m2 ...
+     sim_lisp ...
+     sim_mira ...
+     sim_text ...
+
+DESCRIPTION
+     Sim_c reads the C files file ... and  looks  for  pieces  of
+     text  that are similar; two pieces of program text are simi-
+     lar if they only differ in layout, comment, identifiers  and
+     the  contents  of  numbers,  strings and characters.  If any
+     runs of sufficient length are found, they  are  reported  on
+     standard output; the number of significant tokens in the run
+     is given between square brackets.
+
+     Sim_java does the same for Java, sim_pasc for Pascal, sim_m2
+     for  Modula-2,  sim_lisp for Lisp, and sim_mira for Miranda.
+     Sim_text works on arbitrary text; it is occasionally  useful
+     on shell scripts.
+
+     The program can be used for finding copied pieces of code in
+     purportedly unrelated programs (with -s or -S), or for find-
+     ing accidentally duplicated code in  larger  projects  (with
+     -f).
+
+     If a / is present between the input files,  the  latter  are
+     divided  into  a  group  of "new" files (before the /) and a
+     group of "old" files; if there is no /, all files are "new".
+     Old files are never compared to each other.  Since the simi-
+     larity tester reads the files several times, it cannot  read
+     from standard input.
+
+     There are the following options:
+
+     -d   The output is in a diff(1)-like format instead  of  the
+          default 2-column format.
+
+     -e   Each file is compared to each file in  isolation;  this
+          will  find all similarities between all texts involved,
+          regardless of duplicates.
+
+     -f   Runs  are   restricted   to   pieces   with   balancing
+          parentheses,  to  isolate potential functions (C, Java,
+
+
+
+Vrije Universiteit   Last change: 2001/11/13                    1
+
+
+
+
+
+
+User Commands                                              SIM(1)
+
+
+
+          Pascal, Modula-2 and Lisp only).
+
+     -F   The names of functions in calls are required  to  match
+          exactly (C, Java, Pascal, Modula-2 and Lisp only).
+
+     -n   Similarities found are only summarized, not displayed.
+
+     -o F The output is written to the file named F.
+
+     -p   The output is  given  in  similarity  percentages;  see
+          below.
+
+     -r N The minimum run length is set to N (default is N = 24).
+
+     -s   The contents of a file are not compared to itself (-s =
+          not self).
+
+     -S   The contents of the new files are compared to  the  old
+          files only - not between themselves.
+
+     -w N The page width used is set to N columns (default is N =
+          80).
+
+     The -p option results in lines of the form F consists for  x
+     %  of  G  material  meaning that x % of F's text can also be
+     found in G.  Note that this relation is not symmetric; it is
+     in  fact quite possible for one file to consist for 100 % of
+     text from another file, while the other  file  consists  for
+     only  1 % of text of the first file, if their lengths differ
+     enough.  Note also that the granularity  of  the  recognized
+     text is still governed by the -r option or its default.
+
+     Care has been taken to keep all internal processes linear in
+     the  length of the input, with the exception of the matching
+     process which is almost linear, using a hash table;  various
+     other  tables  are used for speed-up.  If, however, there is
+     not enough memory for the  tables,  they  are  discarded  in
+     order of unimportance, under which conditions the algorithms
+     revert to their quadratic nature.
+
+AUTHOR
+     Dick Grune, Vrije Universiteit, Amsterdam.
+
+BUGS
+     Strong periodicity in the input text  (like  a  table  of  N
+     almost  identical lines) causes problems.  Sim tries to cope
+     with this but cannot avoid giving appr. log N messages about
+     it.   The  best  advice is still to take the offending files
+     out of the game.
+
+     Since it uses lex(1) on some systems, it may  dump  core  on
+     any   weird   construction  that  overflows  lex's  internal
+
+
+
+Vrije Universiteit   Last change: 2001/11/13                    2
+
+
+
+
+
+
+User Commands                                              SIM(1)
+
+
+
+     buffers.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Vrije Universiteit   Last change: 2001/11/13                    3
+
+
+

+ 57 - 0
utils/sim_pasc/sortlist.bdy

@@ -0,0 +1,57 @@
+/*
+	Module:	Sort Linked Lists
+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
+	Version:	Tue Sep 17 17:32:33 1991
+
+Description:
+	This is the implementation part of a generic routine that sorts
+	linked lists.
+
+Instantiation:
+	See sortlist.spc
+*/
+
+/*	SORT_NAME sorts, in place, the linked list whose head pointer is
+	stored at *lh; on return *lh points to the first element of the
+	sorted list.
+	The method is a selection sort: for each position in turn, the
+	rest of the list is scanned for the element that SORT_BEFORE ranks
+	first, and that element is relinked into the position.
+	The routine is static unless _SORT_EXTERN_DEFINED is defined.
+*/
+#ifndef	_SORT_EXTERN_DEFINED
+static
+#endif
+void
+SORT_NAME(struct SORT_STRUCT **lh) {
+	/*	I've  never known that sorting a linked list was this
+		complicated; what am I missing?
+	*/
+	/* listhook points to the link that heads the still unsorted part */
+	register struct SORT_STRUCT **listhook = lh;
+
+	while (*listhook) {
+		/* 0. the list is not empty -> there must be a smallest one */
+		register struct SORT_STRUCT **hsmall;
+
+		/* 1. find (the pointer to) the smallest element */
+		{
+			register struct SORT_STRUCT **hook = listhook;
+
+			/* assume initially that first element is smallest */
+			hsmall = hook;
+			/* note: the first test compares that element with itself */
+			while (*hook) {
+				if (SORT_BEFORE(*hook, *hsmall)) {
+					/* revise opinion */
+					hsmall = hook;
+				}
+				hook = &(*hook)->SORT_NEXT;
+			}
+		}
+
+		/* 2. move the smallest element to front */
+		{
+			register struct SORT_STRUCT *smallest = *hsmall;
+
+			/* remove it from the chain */
+			*hsmall = smallest->SORT_NEXT;
+			/* and insert it before the first element */
+			smallest->SORT_NEXT = *listhook;
+			*listhook = smallest;
+		}
+
+		/* 3. skip over smallest element */
+		listhook = &(*listhook)->SORT_NEXT;
+	}
+}

+ 65 - 0
utils/sim_pasc/sortlist.spc

@@ -0,0 +1,65 @@
+/*
+	Module:	Sort Linked Lists
+	Author:	[email protected] (Dick Grune @ Vrije Universiteit, Amsterdam)
+	Version:	Tue Sep 17 17:32:33 1991
+
+Description:
+	This is the specification part of a generic routine that sorts linked
+	lists. The elements in the list are structs, each of which carries a
+	pointer to the next element.
+
+Instantiation, inline:
+	For each struct list type T, specify:
+	-	a definition of SORT_STRUCT, the struct name of the linked
+		structs
+	-	a definition of SORT_NAME, the name of the resulting sort
+		routine
+	-	a definition of a routine
+			int SORT_BEFORE(
+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
+			)
+		which yields non-zero if v is to be sorted before w
+	-	a definition of a field selector SORT_NEXT which names the
+		field that points to the next struct SORT_STRUCT in the list
+	-	#include	"sortlist.bdy"
+
+Instantiation, separate:
+	For each struct list type T, create a file sortT.h which contains at
+	least:
+	-	a definition of SORT_STRUCT, the struct name of the linked
+		structs
+	-	a definition of SORT_NAME, the name of the resulting sort
+		routine
+	-	#include	"sortlist.spc"
+
+	This file sortT.h is to be included in all files that use the routine
+	SORT_NAME.
+
+	For each struct list type T, create a file sortT.c which contains at
+	least:
+	-	#include	"sortT.h"
+	-	a definition of a routine
+			int SORT_BEFORE(
+				struct SORT_STRUCT *v, struct SORT_STRUCT *w
+			)
+		which yields non-zero if v is to be sorted before w
+	-	a definition of a field selector SORT_NEXT which names the
+		field that points to the next struct SORT_STRUCT in the list
+	-	#include	"sortlist.bdy"
+
+	This file sortT.c compiles into the module object for SORT_STRUCT.
+
+Specification:
+	The module supplies:
+	-	void SORT_NAME(struct SORT_STRUCT **listhook)
+		where 'listhook' is a pointer to the location that holds the
+		pointer to the list to be sorted. Upon return, the list will
+		be sorted, and the pointer updated.
+		The routine will be defined static when instantiated inline.
+
+Implementation:
+	Selection sort, quadratic in the length of the list :-(.
+*/
+
+extern void SORT_NAME(struct SORT_STRUCT **);
+#define	_SORT_EXTERN_DEFINED

+ 56 - 0
utils/sim_pasc/stream.c

@@ -0,0 +1,56 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: stream.c,v 2.7 2001/11/08 12:30:32 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<sys/types.h>
+#include	<sys/stat.h>
+
+#include	"system.par"
+#include	"token.h"
+#include	"lex.h"
+#include	"lang.h"
+#include	"stream.h"
+
+static FILE *fopen_regular_file(const char *fname);
+
+int
+OpenStream(const char *fname) {
+	/*	Points the lexical analyzer at the file fname and resets
+		its counters.  Returns 1 if fname could be opened as a
+		regular file, 0 otherwise; in the latter case the analyzer
+		is given NULLFILE to read instead.
+	*/
+	FILE *input;
+
+	/* reset the counters kept by the lexical analyzer */
+	lex_nl_cnt = 1;
+	lex_tk_cnt = 0;
+	lex_non_ascii_cnt = 0;
+
+	input = fopen_regular_file(fname);
+	if (input) {
+		/* start the lex machine on the real file */
+		yyin = input;
+		yystart();
+		return 1;
+	}
+
+	/* fake a stream, to simplify the rest of the program */
+	yyin = fopen(NULLFILE, "r");
+	yystart();
+	return 0;
+}
+
+static FILE *fopen_regular_file(const char *fname) {
+	/*	Opens fname for reading, but only if stat() succeeds on it
+		and reports a regular file; returns 0 in all other cases.
+	*/
+	struct stat info;
+
+	if (stat(fname, &info) == 0 && (info.st_mode & S_IFMT) == S_IFREG) {
+		return fopen(fname, "r");
+	}
+	return 0;
+}
+
+int
+NextStreamTokenObtained(void) {
+	/*	Asks the lexical analyzer for the next token and passes on
+		the result of yylex() unchanged; callers treat a zero
+		result as "no token obtained".
+	*/
+	return yylex();
+}
+
+void
+CloseStream(void) {
+	/*	Closes the stream the lexical analyzer reads from, if there
+		is one, and clears yyin so that a second call is harmless.
+	*/
+	if (!yyin) return;
+
+	fclose(yyin);
+	yyin = 0;
+}

+ 17 - 0
utils/sim_pasc/stream.h

@@ -0,0 +1,17 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: stream.h,v 2.4 1998/02/03 14:28:36 dick Exp $
+*/
+
+/*
+	Interface of the stream module.
+
+	Implements the direct interaction with the lexical
+	module.  It supplies the routines below.
+*/
+
+#include	"token.h"
+
+extern int OpenStream(const char *);
+extern int NextStreamTokenObtained(void);
+extern void CloseStream(void);

+ 17 - 0
utils/sim_pasc/sysidf.mk

@@ -0,0 +1,17 @@
+#	I N S T A L L A T I O N   P A R A M E T E R S
+
+BINDIR =	/home/dick/bin.`arch`
+MANDIR =	/home/dick/man/man1
+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
+
+#	C O M P I L A T I O N   P A R A M E T E R S
+
+EXE =		#
+CC =		gcc -pedantic -Wall
+LEX =		flex
+COPY =		cp -p
+ZIP =		zip -o
+LINT =		lint -ansi $(SYSTEM)
+LINTFLAGS =	-xh
+
+SYSTEM =	-DUNIX

+ 17 - 0
utils/sim_pasc/sysidf.msdos

@@ -0,0 +1,17 @@
+#	I N S T A L L A T I O N   P A R A M E T E R S
+
+BINDIR =	/com
+MANDIR =	/man
+
+
+#	C O M P I L A T I O N   P A R A M E T E R S
+
+EXE =		.exe
+CC =		gcc -pedantic -Wall
+LEX =		flex
+COPY =		xcopy
+ZIP =		pkzip -ko
+ATFILEARGS =	gcc.exe:ar.exe:lint.exe## use DOS at-convention for these
+LINT =		lint -ansi $(SYSTEM)
+
+SYSTEM =	-DMSDOS

+ 19 - 0
utils/sim_pasc/sysidf.unix

@@ -0,0 +1,19 @@
+#	I N S T A L L A T I O N   P A R A M E T E R S
+
+BINDIR =	/home/dick/bin.`arch`
+MANDIR =	/home/dick/man/man1
+FTPDIR =	/usr/local/ftpd/pub/dick/similarity_tester
+FTPFILES =	README.1st READ_ME TechnReport
+VERSION =	2_19
+
+#	C O M P I L A T I O N   P A R A M E T E R S
+
+EXE =		#
+CC =		gcc -pedantic -Wall
+LEX =		flex
+COPY =		cp -p
+ZIP =		zip -o
+LINT =		lint -ansi $(SYSTEM)
+LINTFLAGS =	-xh
+
+SYSTEM =	-DUNIX

+ 20 - 0
utils/sim_pasc/system.par

@@ -0,0 +1,20 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: system.par,v 1.2 2001/09/28 09:03:55 dick Exp $
+*/
+
+/*	Operating-system dependent data */
+
+#ifdef	UNIX
+
+#define	int32		int		/* type of a 32 bits signed int */
+#define	NULLFILE	"/dev/null"
+
+#endif
+
+#ifdef	MSDOS		/* GNU gcc */
+
+#define	int32		int		/* type of a 32 bits signed int */
+#define	NULLFILE	"nul"
+
+#endif

+ 236 - 0
utils/sim_pasc/text.c

@@ -0,0 +1,236 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: text.c,v 1.2 2001/11/13 12:55:58 dick Exp $
+*/
+
+#include	<stdio.h>
+#include	<malloc.h>
+
+#include	"debug.par"
+#include	"sim.h"
+#include	"token.h"
+#include	"stream.h"
+#include	"lex.h"
+#include	"options.h"
+#include	"error.h"
+#include	"text.h"
+
+struct newline {
+	unsigned char nl_tk_diff;	/* token position difference */
+};
+
+#define	NL_INCR		1000		/* increment of newline buffer size */
+
+static struct newline *nl_buff;		/* to be filled by malloc */
+static unsigned int nl_size;		/* size of nl_buff[] */
+static unsigned int nl_free;		/* next free position in nl_buff[] */
+
+static unsigned int nl_next, nl_limit;	/* nl_buff[] pointers during pass 2 */
+
+static void store_newline(void);
+static void init_nl_buff(void);
+
+/*							TEXT INTERFACE */
+
+static unsigned int last_tk_cnt;	/* token count at newline */
+static unsigned int last_nl_cnt;	/* nl counter during pass 2 */
+
+void
+InitText(int nfiles) {
+	/*	Sets NumberOfTexts to nfiles, allocates the array Text[] of
+		that many text descriptors and sets up the newline buffer.
+		Calls fatal() if the array cannot be allocated.
+	*/
+	NumberOfTexts = nfiles;
+
+	/*	the byte count is passed to malloc() as computed; narrowing
+		it to unsigned int first could silently request a buffer
+		that is too small
+	*/
+	Text = (struct text *)
+		malloc(sizeof (struct text) * NumberOfTexts);
+	if (!Text) fatal("out of memory");
+
+	init_nl_buff();
+}
+
+int
+OpenText(enum Pass pass, struct text *txt) {
+	/*	Prepares the text txt for reading in the given pass.
+		Returns the result of OpenStream() on the file of txt,
+		except in the second pass when the newline buffer is
+		available: then the file is not opened at all and 1 is
+		returned.
+	*/
+	switch (pass) {
+	case First:
+		last_tk_cnt = 0;
+		if (nl_buff) {
+			/* the entries for this text will start here */
+			txt->tx_nl_start = nl_free;
+		}
+		break;
+
+	case Second:
+		last_tk_cnt = 0;
+		if (nl_buff) {
+			/* replay the entries recorded for this text in pass 1 */
+			nl_next = txt->tx_nl_start;
+			nl_limit = txt->tx_nl_limit;
+			last_nl_cnt = 1;
+			/* set the counters to the values OpenStream() gives them */
+			lex_nl_cnt = 1;
+			lex_tk_cnt = 0;
+			return 1;
+		}
+		break;
+	}
+
+	/* no newline info available (yet): read the file itself */
+	return OpenStream(txt->tx_fname);
+}
+
+int
+NextTextTokenObtained(enum Pass pass) {
+	/*	Advances to the next token of interest in the current text
+		and returns non-zero if there was one.
+		In the first pass every token is delivered, and the token
+		position of each EOL is offered to the newline buffer.
+		In the second pass only EOL tokens are delivered, either
+		replayed from the newline buffer or read from the file.
+	*/
+	register int ok = 0;	/* gcc does not understand enum Pass */
+
+	switch (pass) {
+	case First:
+		ok = NextStreamTokenObtained();
+		if (TOKEN_EQ(lex_token, EOL)) {
+			store_newline();
+			/* the next difference is measured from this line end */
+			last_tk_cnt = lex_tk_cnt;
+		}
+		break;
+
+	case Second:
+		/* get newline info from the buffer or from the file itself */
+		if (nl_buff) {
+			if (nl_next == nl_limit) {
+				/* all entries of this text have been delivered */
+				ok = 0;
+			}
+			else {
+				struct newline *nl = &nl_buff[nl_next++];
+
+				/* reconstruct the counters from the stored difference */
+				lex_nl_cnt = ++last_nl_cnt;
+				lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff);
+				lex_token = EOL;
+				ok = 1;
+			}
+		}
+		else {
+			/* read on until an EOL token or the end of the stream */
+			while (	(ok = NextStreamTokenObtained())
+			&&	!TOKEN_EQ(lex_token, EOL)
+			) {
+				/* skip */
+			}
+		}
+		break;
+	}
+
+	return ok;
+}
+
+void
+CloseText(enum Pass pass, struct text *txt) {
+	/*	Finishes the reading of the text txt in the given pass and
+		closes the underlying stream.  At the end of the first
+		pass the range of newline buffer entries belonging to txt
+		is completed, provided the buffer is in use.
+	*/
+	if (pass == First && nl_buff) {
+		if (lex_tk_cnt != last_tk_cnt) {
+			/* the last line had tokens but no newline */
+			store_newline();
+		}
+		txt->tx_nl_limit = nl_free;
+	}
+
+	CloseStream();
+}
+
+/*							NEWLINE CACHING */
+
+/*	To speed up pass2 which is interested in token positions at line ends,
+	the newline buffer keeps this info from pass1. To reduce the size of
+	the newline buffer, the info is kept as the differences of the values
+	at consecutive line ends. This allows unsigned chars to be used rather
+	than integers.
+
+	The recording of token position differences at EOL is optional, and
+	is switched off if
+	-	there is not room enough for the newline buffer.
+	-	a difference would not fit in the field in the struct.
+	Switching off is done by freeing the buffer and setting nl_buff to 0.
+	Anybody using nl_buff should therefore test for nl_buff being zero.
+*/
+
+static void abandon_nl_buff(void);
+
+static void
+init_nl_buff(void) {
+	/*	Sets up the newline buffer with room for NL_INCR entries.
+		No buffer is allocated when option 'x' is set; nl_buff is
+		then 0, as it is when malloc() fails.
+	*/
+	nl_size = NL_INCR;
+
+	if (option_set('x')) {
+		nl_buff = 0;
+	}
+	else {
+		nl_buff = (struct newline *)
+			malloc(sizeof (struct newline) * nl_size);
+	}
+}
+
+static void
+store_newline(void) {
+	/*	Appends to the newline buffer the number of tokens read
+		since the previous line end.  Does nothing if there is no
+		buffer; abandons the buffer if it cannot be extended or if
+		the number does not fit in an entry.
+	*/
+	struct newline *slot;
+	unsigned int diff;
+
+	if (!nl_buff) return;
+
+	if (nl_free == nl_size) {
+		/* the buffer is full; try to extend it by NL_INCR entries */
+		unsigned int grown_size = nl_size + NL_INCR;
+		struct newline *grown_buff = (struct newline *)realloc(
+			(char *)nl_buff,
+			sizeof (struct newline) * grown_size
+		);
+
+		if (!grown_buff) {
+			/* no memory: give up on newline caching */
+			abandon_nl_buff();
+			return;
+		}
+		nl_buff = grown_buff;
+		nl_size = grown_size;
+	}
+
+	/* claim the next entry and record the difference in it */
+	slot = &nl_buff[nl_free];
+	nl_free++;
+	diff = lex_tk_cnt - last_tk_cnt;
+	slot->nl_tk_diff = diff;
+
+	if (slot->nl_tk_diff != diff) {
+		/* the value was truncated by the narrow field */
+		abandon_nl_buff();
+	}
+}
+
+static void
+abandon_nl_buff(void) {
+	/*	Switches newline caching off: frees the buffer, if any, and
+		sets nl_buff to 0, which is what all users test for.
+	*/
+	if (!nl_buff) return;
+
+	free((char *)nl_buff);
+	nl_buff = 0;
+}
+
+#ifdef	DB_NL_BUFF
+
+void
+db_print_nl_buff(unsigned int start, unsigned int limit) {
+	/*	Debugging aid: prints on DebugFile the entries start up to
+		but not including limit of the newline buffer.  Prints a
+		diagnostic instead if there is no buffer or if start or
+		limit lies beyond the entries in use.
+	*/
+	unsigned int i;	/* unsigned, like the bounds it is compared to */
+
+	fprintf(DebugFile, "\n**** DB_NL_BUFF ****\n");
+	if (!nl_buff) {
+		fprintf(DebugFile, ">>>> NO NL_BUFF\n\n");
+		return;
+	}
+
+	if (start > nl_free) {
+		fprintf(DebugFile, ">>>> start (%u) > nl_free (%u)\n\n",
+			start, nl_free
+		);
+		return;
+	}
+	if (limit > nl_free) {
+		fprintf(DebugFile, ">>>> limit (%u) > nl_free (%u)\n\n",
+			limit, nl_free
+		);
+		return;
+	}
+
+	fprintf(DebugFile, "nl_buff: %u entries:\n", nl_free);
+	for (i = start; i < limit; i++) {
+		struct newline *nl = &nl_buff[i];
+
+		/* the unsigned char is promoted to int, so %d is correct */
+		fprintf(DebugFile, "nl_tk_diff = %d\n", nl->nl_tk_diff);
+	}
+	fprintf(DebugFile, "\n");
+}
+
+#endif	/* DB_NL_BUFF */

+ 20 - 0
utils/sim_pasc/text.h

@@ -0,0 +1,20 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: text.h,v 1.2 2001/09/28 09:03:56 dick Exp $
+*/
+
+/*	Implements the access to the lexical scanner.
+	Additionally, the module tries to save newline information,
+	anticipating a second scan which is interested in this
+	information only.
+*/
+
+extern void InitText(int nfiles);
+enum Pass {First, Second};
+extern int OpenText(enum Pass pass, struct text *txt);
+extern int NextTextTokenObtained(enum Pass pass);
+extern void CloseText(enum Pass pass, struct text *txt);
+
+#ifdef	DB_NL_BUFF
+extern void db_print_nl_buff(unsigned int start, unsigned int limit);
+#endif

+ 72 - 0
utils/sim_pasc/textlang.l

@@ -0,0 +1,72 @@
+%{
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: textlang.l,v 1.3 2007/08/29 09:10:36 dick Exp $
+*/
+
+/*
+	Text front end for the similarity tester.
+*/
+
+#include	"language.h"
+#include	"token.h"
+#include	"idf.h"
+#include	"lex.h"
+#include	"lang.h"
+
+/* Language-dependent Code */
+
+void
+InitLanguage(void) {
+	/* the text front end has nothing to initialize */
+}
+
+/*	Tells whether a run may start with the token tk; for text this is
+	always so, and tk is not examined.
+*/
+/*ARGSUSED*/
+int
+MayBeStartOfRun(TOKEN tk) {
+	/* any token is acceptable */
+	return 1;
+}
+
+/*	Yields the acceptable length of the run of size tokens starting at
+	str; for text the size is returned unchanged, and str is not
+	examined.
+*/
+/*ARGSUSED*/
+unsigned int
+CheckRun(const TOKEN *str, unsigned int size) {
+	/* any run is acceptable */
+	return size;
+}
+
+%}
+
+%option nounput
+%option never-interactive
+
+Layout		([ \t\r\f])
+
+%%
+
+[^ \t\r\f\n]+	{			/* a word */
+		/*	a word is defined as anything not containing
+			layout or a newline.  The character class must
+			exclude every Layout character: this rule comes
+			before the Layout rule and lex prefers the longest
+			match, so a \r or \f left out here would end up
+			inside a word (e.g. the \r of a CR-LF line end).
+		*/
+		return_tk(idf_hashed(yytext));
+	}
+
+\n	{				/* count newlines */
+		return_eol();
+	}
+
+{Layout}	{			/* ignore layout */
+	}
+
+%%
+
+/* Language-INdependent Code */
+
+void
+yystart(void) {
+	/* put the scanner in its INITIAL start condition */
+	BEGIN INITIAL;
+}
+
+int
+yywrap(void) {
+	/* called by the scanner at end of input; 1 = no further input */
+	return 1;
+}

+ 44 - 0
utils/sim_pasc/token.c

@@ -0,0 +1,44 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: token.c,v 2.4 2001/11/13 12:55:58 dick Exp $
+*/
+
+/*
+	Token interface, implementation part.
+*/
+
+#include	<stdio.h>
+
+#include	"token.h"
+
+void
+print_token(FILE *ofile, TOKEN tk) {
+	/*	Writes the token tk to ofile as exactly two characters, a
+		prefix and a printable character:
+			bit 8 clear			bit 8 set (meta)
+			^A	control		$A	meta-control
+			 A	printable	#A	meta
+			^?	DEL		$?	meta-DEL
+	*/
+	int value = TOKEN2int(tk);
+	int low7 = value & 0177;
+	int is_meta = (value & 0200) != 0;
+	int prefix;
+	int shown;
+
+	if (low7 >= ' ' && low7 <= '~') {
+		/* printable as it is */
+		prefix = (is_meta ? '#' : ' ');
+		shown = low7;
+	}
+	else {
+		/* control character or DEL; map it to a printable one */
+		prefix = (is_meta ? '$' : '^');
+		shown = (low7 == 0177 ? '?' : low7 + '@');
+	}
+
+	fprintf(ofile, "%c%c", prefix, shown);
+}
+
+#ifdef	TESTTOKEN
+
+int
+TOKEN_EQ(TOKEN t1, TOKEN t2) {
+	/*	Function version of TOKEN_EQ, compiled only when TESTTOKEN
+		is defined; yields non-zero if t1 and t2 have the same
+		integer value.
+	*/
+	/* to make sure TOKEN_EQ is indeed called with two TOKEN parameters */
+	return TOKEN2int(t1) == TOKEN2int(t2);
+}
+
+#endif	/* TESTTOKEN */

+ 52 - 0
utils/sim_pasc/token.h

@@ -0,0 +1,52 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: token.h,v 2.4 2001/11/13 12:55:59 dick Exp $
+*/
+
+/*
+	Token interface.
+	Since the definition of a token has been a continual source of
+	problems, it is now defined as an Abstract Data Type.
+	To allow stronger type checking, there is a special version for use
+	by lint.
+*/
+
+#include	<stdio.h>
+
+#ifndef	TOKEN
+
+#ifdef	lint
+#define	TESTTOKEN
+#endif
+
+#ifdef	TESTTOKEN				/* strict version */
+
+struct cccc {
+	int cccc;
+};
+
+typedef struct cccc *lintTOKEN;
+#define	TOKEN		lintTOKEN
+#define	TOKEN2int(c)	((int)(c))
+#define	int2TOKEN(i)	((TOKEN)(i))
+extern int TOKEN_EQ(TOKEN t1, TOKEN t2);
+
+#else						/* production version */
+
+#define	TOKEN		unsigned char
+#define	TOKEN2int(c)	((c)&0377)
+#define	int2TOKEN(i)	((TOKEN)(i))
+#define	TOKEN_EQ(t1,t2)	(TOKEN2int(t1) == TOKEN2int(t2))
+
+#endif	/* TESTTOKEN */
+
+#endif	/* TOKEN */
+
+/* Macros for the composition of tokens */
+#define	NORM(ch)	int2TOKEN((ch)&0377)
+#define	CTRL(ch)	int2TOKEN((ch)&0037)
+#define	META(ch)	int2TOKEN((ch)|0200)
+#define	MTCT(ch)	int2TOKEN(((ch)&0037)|0200)
+#define	NOTOKEN		int2TOKEN(0)
+
+extern void print_token(FILE *ofile, TOKEN tk);	/* in two characters */

+ 52 - 0
utils/sim_pasc/tokenarray.c

@@ -0,0 +1,52 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: tokenarray.c,v 1.2 2001/11/13 12:55:59 dick Exp $
+*/
+
+#include	<malloc.h>
+
+#include	"error.h"
+#include	"lex.h"
+#include	"tokenarray.h"
+
+#define	TK_INCR		10000		/* increment of token array size */
+
+TOKEN *TokenArray;			/* to be filled by malloc */
+static unsigned int tk_size;		/* size of TokenArray[] */
+static unsigned int tk_free;		/* next free position in TokenArray[] */
+
+void
+InitTokenArray(void) {
+	/*	Allocates the initial token array of TK_INCR tokens; calls
+		fatal() if there is no memory for it.  The first token
+		will be stored at position 1.
+	*/
+	tk_size = TK_INCR;
+	TokenArray = (TOKEN *)malloc(tk_size * sizeof (TOKEN));
+	if (TokenArray == 0) {
+		fatal("out of memory");
+	}
+
+	/* position 0 is deliberately left unused */
+	tk_free = 1;
+}
+
+void
+StoreToken(void) {
+	/*	Appends the current token, lex_token, to TokenArray[],
+		extending the array by TK_INCR tokens when it is full.
+		Calls fatal() if the array cannot be extended.
+	*/
+	if (tk_free == tk_size) {
+		/* allocated array is full; try to increase its size */
+		unsigned int new_size = tk_size + TK_INCR;
+		register TOKEN *new_array;
+
+		/*	check for wrap-around first, so that realloc() is
+			never asked for the wrapped (too small) size
+		*/
+		if (new_size < tk_free)
+			fatal("internal error: TK_INCR causes numeric overflow");
+
+		new_array = (TOKEN *)realloc(
+			(char *)TokenArray,
+			sizeof (TOKEN) * new_size
+		);
+		if (!new_array) {
+			/* we failed */
+			fatal("out of memory");
+		}
+		TokenArray = new_array, tk_size = new_size;
+	}
+
+	/* now we are sure there is room enough */
+	TokenArray[tk_free++] = lex_token;
+}
+
+unsigned int
+TextLength(void) {
+	/*	Returns the index of the next free position in TokenArray[];
+		since position 0 is not used for tokens (see InitTokenArray),
+		this is one more than the number of tokens stored.
+	*/
+	return tk_free;
+}

+ 13 - 0
utils/sim_pasc/tokenarray.h

@@ -0,0 +1,13 @@
+/*	This file is part of the software similarity tester SIM.
+	Written by Dick Grune, Vrije Universiteit, Amsterdam.
+	$Id: tokenarray.h,v 1.1 2001/09/28 09:03:42 dick Exp $
+*/
+
+#include	"token.h"
+
+/* Interface for the token storage */
+extern void InitTokenArray(void);
+extern void StoreToken(void);
+extern unsigned int TextLength(void);	/* also first free token position */
+extern TOKEN *TokenArray;
+