javalang.l 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. %{
  2. /* This file is part of the software similarity tester SIM.
  3. Written by Dick Grune, Vrije Universiteit, Amsterdam.
  4. $Id: javalang.l,v 1.4 2007/08/29 09:10:32 dick Exp $
  5. */
  6. /*
  7. Java language front end for the similarity tester.
  8. Author: Dick Grune <[email protected]>
  9. */
  10. #include "options.h"
  11. #include "algollike.h"
  12. #include "token.h"
  13. #include "idf.h"
  14. #include "lex.h"
  15. #include "lang.h"
  16. /* Language-dependent Code */
  17. static const struct idf reserved[] = {
  18. {"abstract", NORM('a')},
  19. {"boolean", NORM('b')},
  20. {"break", NORM('B')},
  21. {"byte", CTRL('B')},
  22. {"case", NORM('c')},
  23. {"catch", NORM('C')},
  24. {"char", CTRL('C')},
  25. {"class", META('c')},
  26. {"continue", META('C')},
  27. {"default", NORM('d')},
  28. {"do", NORM('D')},
  29. {"double", CTRL('D')},
  30. {"else", NORM('e')},
  31. {"extends", NORM('E')},
  32. {"false", NORM('g')}, /* Boolean literal */
  33. {"final", NORM('f')},
  34. {"finally", NORM('F')},
  35. {"float", CTRL('F')},
  36. {"for", META('f')},
  37. {"if", NORM('i')},
  38. {"implements", NORM('I')},
  39. {"import", CTRL('I')},
  40. {"instanceof", META('i')},
  41. {"int", META('I')},
  42. {"interface", MTCT('I')},
  43. {"long", NORM('l')},
  44. {"native", NORM('n')},
  45. {"new", NORM('N')},
  46. {"null", CTRL('N')}, /* null literal */
  47. {"package", NORM('p')},
  48. {"private", NORM('P')},
  49. {"protected", CTRL('P')},
  50. {"public", META('p')},
  51. {"return", NORM('r')},
  52. {"short", NORM('s')},
  53. {"static", NORM('S')},
  54. {"super", CTRL('S')},
  55. {"switch", META('s')},
  56. {"synchronized",META('S')},
  57. {"this", NORM('t')},
  58. {"throw", NORM('T')},
  59. {"throws", CTRL('T')},
  60. {"true", META('t')}, /* Boolean literal */
  61. {"void", NORM('v')},
  62. {"volatile", NORM('V')},
  63. {"while", NORM('w')}
  64. };
  65. /* Special treatment of identifiers */
  66. static TOKEN
  67. idf2token(int hashing) {
  68. register TOKEN tk;
  69. tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
  70. if (TOKEN_EQ(tk, IDF) && hashing) {
  71. /* return a one-token hash code */
  72. tk = idf_hashed(yytext);
  73. }
  74. return tk;
  75. }
  76. /* Token sets for module algollike */
  77. const TOKEN NonFinals[] = {
  78. IDF, /* identifier */
  79. NORM('{'),
  80. NORM('('),
  81. NORM('a'), /* abstract */
  82. NORM('b'), /* boolean */
  83. NORM('B'), /* break */
  84. CTRL('B'), /* byte */
  85. NORM('c'), /* case */
  86. NORM('C'), /* catch */
  87. CTRL('C'), /* char */
  88. META('c'), /* class */
  89. META('C'), /* continue */
  90. NORM('d'), /* default */
  91. NORM('D'), /* do */
  92. CTRL('D'), /* double */
  93. NORM('e'), /* else */
  94. NORM('E'), /* extends */
  95. NORM('f'), /* final */
  96. NORM('F'), /* finally */
  97. CTRL('F'), /* float */
  98. META('f'), /* for */
  99. NORM('i'), /* if */
  100. NORM('I'), /* implements */
  101. CTRL('I'), /* import */
  102. META('i'), /* instanceof */
  103. META('I'), /* int */
  104. MTCT('I'), /* interface */
  105. NORM('l'), /* long */
  106. NORM('n'), /* native */
  107. NORM('N'), /* new */
  108. NORM('p'), /* package */
  109. NORM('P'), /* private */
  110. CTRL('P'), /* protected */
  111. META('p'), /* public */
  112. NORM('r'), /* return */
  113. NORM('s'), /* short */
  114. NORM('S'), /* static */
  115. CTRL('S'), /* super */
  116. META('s'), /* switch */
  117. META('S'), /* synchronized */
  118. NORM('T'), /* throw */
  119. CTRL('T'), /* throws */
  120. NORM('v'), /* void */
  121. NORM('V'), /* volatile */
  122. NORM('w'), /* while */
  123. NOTOKEN
  124. };
  125. const TOKEN NonInitials[] = {
  126. NORM(')'),
  127. NORM('}'),
  128. NORM(';'),
  129. NOTOKEN
  130. };
  131. const TOKEN Openers[] = {
  132. NORM('{'),
  133. NORM('('),
  134. NORM('['),
  135. NOTOKEN
  136. };
  137. const TOKEN Closers[] = {
  138. NORM('}'),
  139. NORM(')'),
  140. NORM(']'),
  141. NOTOKEN
  142. };
  143. %}
  144. %option nounput
  145. %option never-interactive
  146. %Start Comment
  147. Layout ([ \t\r\f])
  148. ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])
  149. Digit ([0-9a-fA-F])
  150. UniCode (\\u{Digit}{Digit}{Digit}{Digit})
  151. AnyQuoted ((\\.)|{UniCode})
  152. StrChar ([^"\n\\]|{AnyQuoted})
  153. ChrChar ([^'\n\\]|{AnyQuoted})
  154. StartComment ("/*")
  155. EndComment ("*/")
  156. SafeComChar ([^*\n])
  157. UnsafeComChar ("*")
  158. SingleLineCom ("//".*)
  159. Idf ([A-Za-z][A-Za-z0-9_]*)
  160. %%
  161. {StartComment} {
  162. /* We do not have one single pattern to match a comment
  163. (although one can be written), for two reasons.
  164. The matched string might overflow lex-internal buffers
  165. like yysbuf and yytext; and the pattern would be very
  166. complicated and overtax lex.
  167. So we break up the string into safe chunks and keep
  168. track of where we are in a start condition <Comment>.
  169. */
  170. BEGIN Comment;
  171. }
  172. <Comment>{SafeComChar}+ { /* safe comment chunk */
  173. }
  174. <Comment>{UnsafeComChar} { /* unsafe char, read one by one */
  175. }
  176. <Comment>"\n" { /* to break up long comments */
  177. return_eol();
  178. }
  179. <Comment>{EndComment} { /* end-of-comment */
  180. BEGIN INITIAL;
  181. }
  182. {SingleLineCom}"\n" { /* single-line comment */
  183. return_eol();
  184. }
  185. \"{StrChar}*\" { /* strings */
  186. return_ch('"');
  187. }
  188. \'{ChrChar}+\' { /* characters */
  189. return_ch('\'');
  190. }
  191. (0x)?{Digit}+("l"|"L")? { /* numeral, passed as an identifier */
  192. return_tk(IDF);
  193. }
  194. "import"{Layout}[^;]*; { /* import statement; ignore */
  195. }
  196. {Idf}/"(" { /* identifier in front of ( */
  197. register TOKEN tk;
  198. tk = idf2token(option_set('F'));
  199. if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
  200. }
  201. {Idf} { /* identifier */
  202. register TOKEN tk;
  203. tk = idf2token(0 /* no hashing */);
  204. if (!TOKEN_EQ(tk, SKIP)) return_tk(tk);
  205. }
  206. \; { /* semicolon, conditionally ignored */
  207. if (option_set('f')) return_ch(yytext[0]);
  208. }
  209. \n { /* count newlines */
  210. return_eol();
  211. }
  212. {Layout} { /* ignore layout */
  213. }
  214. {ASCII95} { /* copy other text */
  215. return_ch(yytext[0]);
  216. }
  217. . { /* count non-ASCII chars */
  218. lex_non_ascii_cnt++;
  219. }
  220. %%
  221. /* Language-INdependent Code */
  222. void
  223. yystart(void) {
  224. BEGIN INITIAL;
  225. }
  226. int
  227. yywrap(void) {
  228. return 1;
  229. }