2
0

m2lang.l 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. %{
  2. /* This file is part of the software similarity tester SIM.
  3. Written by Dick Grune, Vrije Universiteit, Amsterdam.
  4. $Id: m2lang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
  5. */
  6. /*
  7. Modula-2 language front end for the similarity tester.
  8. Author: Dick Grune <[email protected]>
  9. */
  10. #include "options.h"
  11. #include "algollike.h"
  12. #include "token.h"
  13. #include "idf.h"
  14. #include "lex.h"
  15. #include "lang.h"
  16. /* Language-dependent Code */
  17. /* Most Modula-2 programs start with a number of IMPORTs that look
  18. very similar from program to program. These are skipped by ignoring
  19. the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT
  20. and FROM, having a flag skip_imports, and start reacting only
  21. at the first non-ignored reserved word.
  22. Also, the nesting comments require a state variable.
  23. */
  24. /* Additional state variables, set in yystart() */
  25. static int skip_imports;
  26. static int comment_level;
  27. /* Data for module idf */
  28. static const struct idf reserved[] = {
  29. {"AND", NORM('&')},
  30. {"ARRAY", NORM('A')},
  31. {"BEGIN", NORM('{')},
  32. {"BY", NORM('B')},
  33. {"CASE", NORM('c')},
  34. {"CONST", NORM('C')},
  35. {"DEFINITION", SKIP},
  36. {"DIV", NORM('/')},
  37. {"DO", NORM('D')},
  38. {"ELSE", NORM('e')},
  39. {"ELSIF", NORM('e')},
  40. {"END", NORM('}')},
  41. {"EXIT", NORM('E')},
  42. {"EXPORT", CTRL('E')},
  43. {"FOR", NORM('F')},
  44. {"FROM", SKIP},
  45. {"IF", NORM('i')},
  46. {"IMPLEMENTATION", SKIP},
  47. {"IMPORT", SKIP},
  48. {"IN", NORM('I')},
  49. {"LOOP", NORM('l')},
  50. {"MOD", NORM('%')},
  51. {"MODULE", SKIP},
  52. {"NOT", NORM('~')},
  53. {"OF", SKIP},
  54. {"OR", NORM('O')},
  55. {"POINTER", NORM('p')},
  56. {"PROCEDURE", NORM('P')},
  57. {"QUALIFIED", NORM('q')},
  58. {"RECORD", NORM('r')},
  59. {"REPEAT", NORM('R')},
  60. {"RETURN", CTRL('r')},
  61. {"SET", NORM('s')},
  62. {"THEN", SKIP},
  63. {"TO", NORM('t')},
  64. {"TYPE", NORM('T')},
  65. {"UNTIL", NORM('u')},
  66. {"VAR", NORM('v')},
  67. {"WHILE", NORM('w')},
  68. {"WITH", NORM('W')},
  69. };
  70. static const struct idf standard[] = {
  71. {"ABS", META('a')},
  72. {"ADDRESS", META('A')},
  73. {"ALLOCATE", MTCT('A')},
  74. {"BITSET", META('b')},
  75. {"BOOLEAN", META('B')},
  76. {"CAP", META('c')},
  77. {"CARDINAL", META('C')},
  78. {"CHAR", MTCT('C')},
  79. {"CHR", META('x')},
  80. {"DEALLOCATE", META('d')},
  81. {"DEC", META('D')},
  82. {"EXCL", META('e')},
  83. {"FALSE", META('f')},
  84. {"FLOAT", META('F')},
  85. {"HALT", META('h')},
  86. {"HIGH", META('H')},
  87. {"INC", META('i')},
  88. {"INCL", META('I')},
  89. {"INTEGER", MTCT('I')},
  90. {"LONGCARD", META('L')},
  91. {"LONGINT", META('L')},
  92. {"LONGREAL", META('L')},
  93. {"MAX", META('m')},
  94. {"MIN", META('M')},
  95. {"NEWPROCESS", META('n')},
  96. {"NIL", META('N')},
  97. {"ODD", META('o')},
  98. {"ORD", META('O')},
  99. {"PROC", META('p')},
  100. {"REAL", META('r')},
  101. {"SIZE", META('s')},
  102. {"SYSTEM", META('S')},
  103. {"TRANSFER", META('t')},
  104. {"TRUE", META('T')},
  105. {"TRUNC", MTCT('T')},
  106. {"VAL", META('v')},
  107. {"WORD", META('w')}
  108. };
  109. /* Special treatment of identifiers */
  110. static TOKEN
  111. idf2token(int hashing) {
  112. register TOKEN tk;
  113. /* the token can be on two lists, reserved and standard */
  114. tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
  115. /* is it one of the keywords to be ignored? */
  116. if (TOKEN_EQ(tk, SKIP)) return tk;
  117. /* The statement below is a significant comment
  118. on the value of state variables.
  119. */
  120. if (!TOKEN_EQ(tk, IDF)) {
  121. /* reserved word, stop the skipping */
  122. skip_imports = 0;
  123. }
  124. else {
  125. /* it is an identifier but not a reserved word */
  126. if (skip_imports) {
  127. /* skip it */
  128. tk = 0;
  129. }
  130. else {
  131. /* look further */
  132. tk = idf_in_list(yytext, standard, sizeof standard, IDF);
  133. if (TOKEN_EQ(tk, IDF) && hashing) {
  134. /* return a one-token hash code */
  135. tk = idf_hashed(yytext);
  136. }
  137. }
  138. }
  139. return tk;
  140. }
  141. /* Token sets for module algollike */
  142. const TOKEN NonFinals[] = {
  143. IDF, /* identifier */
  144. NORM('{'), /* also BEGIN */
  145. NORM('('),
  146. NORM('['),
  147. NORM('A'), /* ARRAY */
  148. NORM('c'), /* CASE */
  149. NORM('C'), /* CONST */
  150. NORM('E'), /* EXIT */
  151. NORM('F'), /* FOR */
  152. NORM('i'), /* IF */
  153. NORM('l'), /* LOOP */
  154. NORM('p'), /* POINTER */
  155. NORM('P'), /* PROCEDURE */
  156. NORM('r'), /* RECORD */
  157. NORM('R'), /* REPEAT */
  158. CTRL('R'), /* RETURN */
  159. NORM('s'), /* SET */
  160. NORM('T'), /* TYPE */
  161. NORM('v'), /* VAR */
  162. NORM('w'), /* WHILE */
  163. NORM('W'), /* WITH */
  164. NOTOKEN
  165. };
  166. const TOKEN NonInitials[] = {
  167. NORM('}'),
  168. NORM(')'),
  169. NORM(']'),
  170. NORM(';'),
  171. NOTOKEN
  172. };
  173. const TOKEN Openers[] = {
  174. NORM('{'),
  175. NORM('('),
  176. NORM('['),
  177. NOTOKEN
  178. };
  179. const TOKEN Closers[] = {
  180. NORM('}'),
  181. NORM(')'),
  182. NORM(']'),
  183. NOTOKEN
  184. };
  185. %}
  186. %option nounput
  187. %option never-interactive
  188. %Start Comment
  189. Layout ([ \t\r\f])
  190. ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
  191. AnyQuoted (\\.)
  192. QuStrChar ([^"\n\\]|{AnyQuoted})
  193. ApoStrChar ([^'\n\\]|{AnyQuoted})
  194. StartComment ("(*")
  195. EndComment ("*)")
  196. SafeComChar ([^*\n])
  197. UnsafeComChar ("*")
  198. Digit ([0-9a-fA-F])
  199. Idf ([A-Za-z][A-Za-z0-9_]*)
  200. %%
  201. {StartComment} { /* See clang.l */
  202. /* Lex itself is incapable of handling Modula-2's
  203. nested comments. So let's help it a bit.
  204. */
  205. if (comment_level == 0) {
  206. BEGIN Comment;
  207. }
  208. comment_level++;
  209. }
  210. <Comment>{SafeComChar}+ { /* safe comment chunk */
  211. }
  212. <Comment>{UnsafeComChar} { /* unsafe char, read one by one */
  213. }
  214. <Comment>"\n" { /* to break up long comments */
  215. return_eol();
  216. }
  217. <Comment>{EndComment} { /* end-of-comment */
  218. comment_level--;
  219. if (comment_level == 0) {
  220. BEGIN INITIAL;
  221. }
  222. }
  223. \"{QuStrChar}*\" { /* quoted strings */
  224. return_ch('"');
  225. }
  226. \'{ApoStrChar}*\' { /* apostrophed strings */
  227. return_ch('"');
  228. }
  229. {Digit}+("B"|"C"|"H")? { /* numeral, passed as an identifier */
  230. return_tk(IDF);
  231. }
  232. "END"{Layout}*{Idf} { /* ignore identifier after END */
  233. return_tk(idf_in_list("END", reserved, sizeof reserved, SKIP));
  234. }
  235. {Idf}/"(" { /* identifier in front of ( */
  236. register TOKEN tk;
  237. tk = idf2token(option_set('F'));
  238. if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
  239. }
  240. {Idf} { /* identifier */
  241. register TOKEN tk;
  242. tk = idf2token(0 /* no hashing */);
  243. if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
  244. }
  245. "<>" { /* <>, special equivalence */
  246. return_ch('#');
  247. }
  248. \; { /* semicolon, conditionally ignored */
  249. if (option_set('f')) return_ch(yytext[0]);
  250. }
  251. \n { /* count newlines */
  252. return_eol();
  253. }
  254. {Layout} { /* ignore layout */
  255. }
  256. {ASCII95} { /* copy other text */
  257. if (!skip_imports) return_ch(yytext[0]);
  258. }
  259. . { /* count non-ASCII chars */
  260. lex_non_ascii_cnt++;
  261. }
  262. %%
  263. /* Language-INdependent Code */
  264. void
  265. yystart(void) {
  266. skip_imports = 1;
  267. comment_level = 0;
  268. BEGIN INITIAL;
  269. }
  270. int
  271. yywrap(void) {
  272. return 1;
  273. }