123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- %{
- /* This file is part of the software similarity tester SIM.
- Written by Dick Grune, Vrije Universiteit, Amsterdam.
- $Id: m2lang.l,v 2.9 2007/08/29 09:10:33 dick Exp $
- */
- /*
- Modula-2 language front end for the similarity tester.
- Author: Dick Grune <[email protected]>
- */
- #include "options.h"
- #include "algollike.h"
- #include "token.h"
- #include "idf.h"
- #include "lex.h"
- #include "lang.h"
- /* Language-dependent Code */
/*	Most Modula-2 programs start with a number of IMPORTs that look
	very similar from program to program. These are skipped as follows:
	the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT and
	FROM never produce a token, and as long as the flag skip_imports
	is set, identifiers and single-character symbols are discarded too.
	The flag is cleared at the first non-ignored reserved word; from
	then on tokens are produced normally.

	Also, the nesting comments require a state variable.
*/
/* Additional state variables; both are (re)initialized in yystart() */

/* non-zero while the leading import section is still being skipped */
static int skip_imports;

/* current nesting depth of comments; 0 when outside any comment */
static int comment_level;
- /* Data for module idf */
- static const struct idf reserved[] = {
- {"AND", NORM('&')},
- {"ARRAY", NORM('A')},
- {"BEGIN", NORM('{')},
- {"BY", NORM('B')},
- {"CASE", NORM('c')},
- {"CONST", NORM('C')},
- {"DEFINITION", SKIP},
- {"DIV", NORM('/')},
- {"DO", NORM('D')},
- {"ELSE", NORM('e')},
- {"ELSIF", NORM('e')},
- {"END", NORM('}')},
- {"EXIT", NORM('E')},
- {"EXPORT", CTRL('E')},
- {"FOR", NORM('F')},
- {"FROM", SKIP},
- {"IF", NORM('i')},
- {"IMPLEMENTATION", SKIP},
- {"IMPORT", SKIP},
- {"IN", NORM('I')},
- {"LOOP", NORM('l')},
- {"MOD", NORM('%')},
- {"MODULE", SKIP},
- {"NOT", NORM('~')},
- {"OF", SKIP},
- {"OR", NORM('O')},
- {"POINTER", NORM('p')},
- {"PROCEDURE", NORM('P')},
- {"QUALIFIED", NORM('q')},
- {"RECORD", NORM('r')},
- {"REPEAT", NORM('R')},
- {"RETURN", CTRL('r')},
- {"SET", NORM('s')},
- {"THEN", SKIP},
- {"TO", NORM('t')},
- {"TYPE", NORM('T')},
- {"UNTIL", NORM('u')},
- {"VAR", NORM('v')},
- {"WHILE", NORM('w')},
- {"WITH", NORM('W')},
- };
- static const struct idf standard[] = {
- {"ABS", META('a')},
- {"ADDRESS", META('A')},
- {"ALLOCATE", MTCT('A')},
- {"BITSET", META('b')},
- {"BOOLEAN", META('B')},
- {"CAP", META('c')},
- {"CARDINAL", META('C')},
- {"CHAR", MTCT('C')},
- {"CHR", META('x')},
- {"DEALLOCATE", META('d')},
- {"DEC", META('D')},
- {"EXCL", META('e')},
- {"FALSE", META('f')},
- {"FLOAT", META('F')},
- {"HALT", META('h')},
- {"HIGH", META('H')},
- {"INC", META('i')},
- {"INCL", META('I')},
- {"INTEGER", MTCT('I')},
- {"LONGCARD", META('L')},
- {"LONGINT", META('L')},
- {"LONGREAL", META('L')},
- {"MAX", META('m')},
- {"MIN", META('M')},
- {"NEWPROCESS", META('n')},
- {"NIL", META('N')},
- {"ODD", META('o')},
- {"ORD", META('O')},
- {"PROC", META('p')},
- {"REAL", META('r')},
- {"SIZE", META('s')},
- {"SYSTEM", META('S')},
- {"TRANSFER", META('t')},
- {"TRUE", META('T')},
- {"TRUNC", MTCT('T')},
- {"VAL", META('v')},
- {"WORD", META('w')}
- };
- /* Special treatment of identifiers */
- static TOKEN
- idf2token(int hashing) {
- register TOKEN tk;
- /* the token can be on two lists, reserved and standard */
- tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
- /* is it one of the keywords to be ignored? */
- if (TOKEN_EQ(tk, SKIP)) return tk;
- /* The statement below is a significant comment
- on the value of state variables.
- */
- if (!TOKEN_EQ(tk, IDF)) {
- /* reserved word, stop the skipping */
- skip_imports = 0;
- }
- else {
- /* it is an identifier but not a reserved word */
- if (skip_imports) {
- /* skip it */
- tk = 0;
- }
- else {
- /* look further */
- tk = idf_in_list(yytext, standard, sizeof standard, IDF);
- if (TOKEN_EQ(tk, IDF) && hashing) {
- /* return a one-token hash code */
- tk = idf_hashed(yytext);
- }
- }
- }
- return tk;
- }
- /* Token sets for module algollike */
- const TOKEN NonFinals[] = {
- IDF, /* identifier */
- NORM('{'), /* also BEGIN */
- NORM('('),
- NORM('['),
- NORM('A'), /* ARRAY */
- NORM('c'), /* CASE */
- NORM('C'), /* CONST */
- NORM('E'), /* EXIT */
- NORM('F'), /* FOR */
- NORM('i'), /* IF */
- NORM('l'), /* LOOP */
- NORM('p'), /* POINTER */
- NORM('P'), /* PROCEDURE */
- NORM('r'), /* RECORD */
- NORM('R'), /* REPEAT */
- CTRL('R'), /* RETURN */
- NORM('s'), /* SET */
- NORM('T'), /* TYPE */
- NORM('v'), /* VAR */
- NORM('w'), /* WHILE */
- NORM('W'), /* WITH */
- NOTOKEN
- };
- const TOKEN NonInitials[] = {
- NORM('}'),
- NORM(')'),
- NORM(']'),
- NORM(';'),
- NOTOKEN
- };
- const TOKEN Openers[] = {
- NORM('{'),
- NORM('('),
- NORM('['),
- NOTOKEN
- };
- const TOKEN Closers[] = {
- NORM('}'),
- NORM(')'),
- NORM(']'),
- NOTOKEN
- };
- %}
%option nounput
%option never-interactive

	/* Comment is an exclusive start condition: while inside a comment
	   only the <Comment> rules are active, so comment text can never
	   be taken for a string, a numeral or an identifier.
	*/
%x Comment

Layout		([ \t\r\f])
ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])

AnyQuoted	(\\.)
QuStrChar	([^"\n\\]|{AnyQuoted})
ApoStrChar	([^'\n\\]|{AnyQuoted})

StartComment	("(*")
EndComment	("*)")
	/* Both '(' and '*' can be the first character of a comment
	   delimiter, so neither may be swallowed by a "safe" chunk;
	   they are read one at a time to give "(*" and "*)" a chance.
	*/
SafeComChar	([^*(\n])
UnsafeComChar	([*(])

Digit		([0-9a-fA-F])
Idf		([A-Za-z][A-Za-z0-9_]*)

%%

<INITIAL,Comment>{StartComment}	{
		/*	Lex itself is incapable of handling Modula-2's
			nested comments. So let's help it a bit:
			comment_level counts the nesting depth, and the
			Comment state is entered on the outermost opener.
		*/
		if (comment_level == 0) {
			BEGIN Comment;
		}
		comment_level++;
	}

<Comment>{SafeComChar}+	{
		/* safe comment chunk, contains no '(' or '*' */
	}

<Comment>{UnsafeComChar}	{
		/* a '(' or '*' that is not part of a delimiter */
	}

<Comment>"\n"	{
		/* to break up long comments */
		return_eol();
	}

<Comment>{EndComment}	{
		/* end-of-comment; leave the state at the outermost closer */
		comment_level--;
		if (comment_level == 0) {
			BEGIN INITIAL;
		}
	}

\"{QuStrChar}*\"	{
		/* quoted strings */
		return_ch('"');
	}

\'{ApoStrChar}*\'	{
		/* apostrophed strings, same token as quoted strings */
		return_ch('"');
	}

[0-9]{Digit}*("B"|"C"|"H")?	{
		/*	numeral, passed as an identifier; it must start with
			a decimal digit, so that names made up of the letters
			a-f only (like DEC) remain identifiers
		*/
		return_tk(IDF);
	}

"END"{Layout}+{Idf}	{
		/*	ignore identifier after END; the layout is mandatory,
			so that a name merely starting with END (ENDPOS) is
			still an identifier
		*/
		return_tk(idf_in_list("END", reserved, sizeof reserved, SKIP));
	}

{Idf}/"("	{
		/* identifier in front of (, hashed if option F is set */
		TOKEN tk;

		tk = idf2token(option_set('F'));
		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
	}

{Idf}	{
		/* identifier */
		TOKEN tk;

		tk = idf2token(0 /* no hashing */);
		if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
	}

"<>"	{
		/* <>, special equivalence: same token as # */
		return_ch('#');
	}

\;	{
		/* semicolon, passed on only if option f is set */
		if (option_set('f')) return_ch(yytext[0]);
	}

\n	{
		/* count newlines */
		return_eol();
	}

{Layout}	{
		/* ignore layout */
	}

{ASCII95}	{
		/* copy other text, unless still skipping the imports */
		if (!skip_imports) return_ch(yytext[0]);
	}

.	{
		/* count non-ASCII chars */
		lex_non_ascii_cnt++;
	}
- %%
- /* Language-INdependent Code */
- void
- yystart(void) {
- skip_imports = 1;
- comment_level = 0;
- BEGIN INITIAL;
- }
/*
 * End-of-input hook of the scanner: always answers 1, i.e. there is
 * no further input to switch to.
 */
int
yywrap(void) {
	const int no_more_input = 1;

	return no_more_input;
}
|