123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270 |
- %{
- /* This file is part of the software similarity tester SIM.
- Written by Dick Grune, Vrije Universiteit, Amsterdam.
- $Id: javalang.l,v 1.4 2007/08/29 09:10:32 dick Exp $
- */
- /*
- Java language front end for the similarity tester.
- Author: Dick Grune <[email protected]>
- */
- #include "options.h"
- #include "algollike.h"
- #include "token.h"
- #include "idf.h"
- #include "lex.h"
- #include "lang.h"
- /* Language-dependent Code */
- /*
-  * Java reserved words and literal names, each paired with the one-token
-  * code that stands for it.  idf2token() hands this table to idf_in_list()
-  * together with its size in bytes and IDF as the default token.
-  * NOTE(review): the entries are in ascending alphabetical order; presumably
-  * idf_in_list() relies on that (e.g. for a binary search) -- confirm before
-  * inserting or reordering entries.
-  * NOTE(review): some Java keywords (e.g. try, transient, strictfp, assert,
-  * enum) have no entry here, so presumably they come out as IDF -- confirm
-  * that this is intended.
-  */
- static const struct idf reserved[] = {
- {"abstract", NORM('a')},
- {"boolean", NORM('b')},
- {"break", NORM('B')},
- {"byte", CTRL('B')},
- {"case", NORM('c')},
- {"catch", NORM('C')},
- {"char", CTRL('C')},
- {"class", META('c')},
- {"continue", META('C')},
- {"default", NORM('d')},
- {"do", NORM('D')},
- {"double", CTRL('D')},
- {"else", NORM('e')},
- {"extends", NORM('E')},
- {"false", NORM('g')}, /* Boolean literal */
- {"final", NORM('f')},
- {"finally", NORM('F')},
- {"float", CTRL('F')},
- {"for", META('f')},
- {"if", NORM('i')},
- {"implements", NORM('I')},
- {"import", CTRL('I')},
- {"instanceof", META('i')},
- {"int", META('I')},
- {"interface", MTCT('I')},
- {"long", NORM('l')},
- {"native", NORM('n')},
- {"new", NORM('N')},
- {"null", CTRL('N')}, /* null literal */
- {"package", NORM('p')},
- {"private", NORM('P')},
- {"protected", CTRL('P')},
- {"public", META('p')},
- {"return", NORM('r')},
- {"short", NORM('s')},
- {"static", NORM('S')},
- {"super", CTRL('S')},
- {"switch", META('s')},
- {"synchronized",META('S')},
- {"this", NORM('t')},
- {"throw", NORM('T')},
- {"throws", CTRL('T')},
- {"true", META('t')}, /* Boolean literal */
- {"void", NORM('v')},
- {"volatile", NORM('V')},
- {"while", NORM('w')}
- };
- /*
-  * Special treatment of identifiers.
-  * Asks idf_in_list() to look up the text in yytext in reserved[], with IDF
-  * as the default token.  A result other than IDF is returned as it is.
-  * An IDF result is returned unchanged when `hashing` is zero; otherwise
-  * it is replaced by the one-token hash code from idf_hashed(yytext).
-  */
- static TOKEN
- idf2token(int hashing) {
-     TOKEN looked_up = idf_in_list(yytext, reserved, sizeof reserved, IDF);
-     /* anything that is not the IDF default goes back untouched */
-     if (!TOKEN_EQ(looked_up, IDF)) {
-         return looked_up;
-     }
-     /* plain identifier: hash it only on request */
-     if (!hashing) {
-         return looked_up;
-     }
-     return idf_hashed(yytext);
- }
- /* Token sets for module algollike */
- /*
-  * NonFinals holds IDF, the tokens for '{' and '(', and the code of every
-  * entry of reserved[] except those of false, null, this and true.
-  * The list is terminated by NOTOKEN.
-  * NOTE(review): presumably these are the tokens on which module algollike
-  * does not let a run end -- confirm against algollike.
-  */
- const TOKEN NonFinals[] = {
- IDF, /* identifier */
- NORM('{'),
- NORM('('),
- NORM('a'), /* abstract */
- NORM('b'), /* boolean */
- NORM('B'), /* break */
- CTRL('B'), /* byte */
- NORM('c'), /* case */
- NORM('C'), /* catch */
- CTRL('C'), /* char */
- META('c'), /* class */
- META('C'), /* continue */
- NORM('d'), /* default */
- NORM('D'), /* do */
- CTRL('D'), /* double */
- NORM('e'), /* else */
- NORM('E'), /* extends */
- NORM('f'), /* final */
- NORM('F'), /* finally */
- CTRL('F'), /* float */
- META('f'), /* for */
- NORM('i'), /* if */
- NORM('I'), /* implements */
- CTRL('I'), /* import */
- META('i'), /* instanceof */
- META('I'), /* int */
- MTCT('I'), /* interface */
- NORM('l'), /* long */
- NORM('n'), /* native */
- NORM('N'), /* new */
- NORM('p'), /* package */
- NORM('P'), /* private */
- CTRL('P'), /* protected */
- META('p'), /* public */
- NORM('r'), /* return */
- NORM('s'), /* short */
- NORM('S'), /* static */
- CTRL('S'), /* super */
- META('s'), /* switch */
- META('S'), /* synchronized */
- NORM('T'), /* throw */
- CTRL('T'), /* throws */
- NORM('v'), /* void */
- NORM('V'), /* volatile */
- NORM('w'), /* while */
- NOTOKEN
- };
- /*
-  * NonInitials holds the tokens for ')', '}' and ';', terminated by NOTOKEN.
-  * NOTE(review): presumably the tokens on which module algollike does not
-  * let a run start -- confirm against algollike.
-  */
- const TOKEN NonInitials[] = {
- NORM(')'),
- NORM('}'),
- NORM(';'),
- NOTOKEN
- };
- /*
-  * Openers holds the tokens for the opening brackets '{', '(' and '[',
-  * terminated by NOTOKEN.  The order is curly, round, square.
-  * NOTE(review): whether module algollike depends on this order is not
-  * visible here -- confirm before reordering.
-  */
- const TOKEN Openers[] = {
- NORM('{'),
- NORM('('),
- NORM('['),
- NOTOKEN
- };
- /*
-  * Closers holds the tokens for the closing brackets '}', ')' and ']',
-  * terminated by NOTOKEN.  The order is curly, round, square.
-  * NOTE(review): whether module algollike depends on this order is not
-  * visible here -- confirm before reordering.
-  */
- const TOKEN Closers[] = {
- NORM('}'),
- NORM(')'),
- NORM(']'),
- NOTOKEN
- };
- %}
- /*
-  * Scanner options: do not generate unput(), and treat the input as
-  * non-interactive.
-  */
- %option nounput
- %option never-interactive
- /*
-  * Comment is an exclusive start condition (%x): while the scanner is
-  * inside a block comment only the rules prefixed with <Comment> are
-  * active.  With an inclusive condition the string, double-slash comment
-  * and import rules would stay active inside a block comment; when such a
-  * rule starts right after a '*' or a newline and its match extends over
-  * a '*', it is longer than the safe comment chunk and wins, so it can
-  * swallow the end of the comment.  The <Comment> rules together match
-  * every possible character, so nothing falls through to the default rule.
-  */
- %x Comment
- /* Character classes and sub-patterns used by the rules. */
- Layout ([ \t\r\f])
- ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
- /* Digit is a hexadecimal digit, not just a decimal one. */
- Digit ([0-9a-fA-F])
- UniCode (\\u{Digit}{Digit}{Digit}{Digit})
- AnyQuoted ((\\.)|{UniCode})
- StrChar ([^"\n\\]|{AnyQuoted})
- ChrChar ([^'\n\\]|{AnyQuoted})
- StartComment ("/*")
- EndComment ("*/")
- SafeComChar ([^*\n])
- UnsafeComChar ("*")
- SingleLineCom ("//".*)
- /* NOTE(review): Idf does not accept a leading '_' or any '$'. */
- Idf ([A-Za-z][A-Za-z0-9_]*)
- %%
- {StartComment} {
- /* We do not have one single pattern to match a comment
- (although one can be written), for two reasons.
- The matched string might overflow lex-internal buffers
- like yysbuf and yytext; and the pattern would be very
- complicated and overtax lex.
- So we break up the string into safe chunks and keep
- track of where we are in a start condition <Comment>.
- */
- /* the comment opener itself yields no token */
- BEGIN Comment;
- }
- <Comment>{SafeComChar}+ { /* safe comment chunk: a run without '*' or newline; discarded */
- }
- <Comment>{UnsafeComChar} { /* unsafe char, read one by one: a '*' that is not part of the comment closer; discarded */
- }
- <Comment>"\n" { /* to break up long comments; the line end is still reported */
- return_eol();
- }
- <Comment>{EndComment} { /* end-of-comment: back to the INITIAL start condition */
- BEGIN INITIAL;
- }
- {SingleLineCom}"\n" { /* single-line comment, matched together with its newline; only the line end is reported */
- return_eol();
- }
- \"{StrChar}*\" { /* strings: the whole literal is passed on as one double-quote character */
- return_ch('"');
- }
- \'{ChrChar}+\' { /* characters: the whole literal is passed on as one single-quote character */
- return_ch('\'');
- }
- (0x)?{Digit}+("l"|"L")? { /* numeral, passed as an identifier */
- /* NOTE(review): Digit includes a-f and A-F, so a word made only of
- those letters (e.g. "abc") matches here too, and on a tie in length
- this rule comes before the plain identifier rule.  No entry of
- reserved[] consists solely of such letters, so presumably the
- result is IDF either way -- confirm. */
- return_tk(IDF);
- }
- "import"{Layout}[^;]*; { /* import statement; ignore */
- /* NOTE(review): the class [^;] also matches newlines, so line ends
- inside a multi-line import are not passed to return_eol(). */
- }
- {Idf}/"(" { /* identifier in front of ( */
- register TOKEN tk;
- /* idf2token() is told to hash when option 'F' is set */
- tk = idf2token(option_set('F'));
- /* a SKIP result yields no token */
- if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
- }
- {Idf} { /* identifier */
- register TOKEN tk;
- tk = idf2token(0 /* no hashing */);
- /* a SKIP result yields no token */
- if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
- }
- \; { /* semicolon, conditionally ignored */
- /* passed on only when option 'f' is set */
- if (option_set('f')) return_ch(yytext[0]);
- }
- \n { /* count newlines */
- return_eol();
- }
- {Layout} { /* ignore layout */
- }
- {ASCII95} { /* copy other text */
- return_ch(yytext[0]);
- }
- . { /* count non-ASCII chars: anything no earlier rule matched; no token is returned */
- lex_non_ascii_cnt++;
- }
- %%
- /* Language-INdependent Code */
- /* Put the scanner into the INITIAL start condition. */
- void
- yystart(void) {
- BEGIN INITIAL;
- }
- /*
-  * Always returns 1, which by the lex/flex convention for yywrap() means
-  * that there is no further input after the end of the current file.
-  */
- int
- yywrap(void) {
- return 1;
- }
|