scan.l 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. /**
  2. * @file
  3. * @ingroup cgraph_core
  4. */
  5. /*************************************************************************
  6. * Copyright (c) 2011 AT&T Intellectual Property
  7. * All rights reserved. This program and the accompanying materials
  8. * are made available under the terms of the Eclipse Public License v1.0
  9. * which accompanies this distribution, and is available at
  10. * https://www.eclipse.org/legal/epl-v10.html
  11. *
  12. * Contributors: Details at https://graphviz.org
  13. *************************************************************************/
  14. /* requires flex (i.e. not lex) */
  15. /* By default, Flex emits a lexer using symbols prefixed with "yy". Graphviz
  16. * contains multiple Flex-generated lexers, so we alter this prefix to avoid
  17. * symbol clashes.
  18. */
  19. %option prefix="aag"
  20. /* Avoid generating an unused input function. See
  21. https://westes.github.io/flex/manual/Scanner-Options.html
  22. */
  23. %option noinput
  24. %{
  25. #include <assert.h>
  26. #include <grammar.h>
  27. #include <cgraph/cghdr.h>
  28. #include <cgraph/gv_ctype.h>
  29. #include <stdbool.h>
  30. #include <stddef.h>
  31. #include <string.h>
  32. #include <util/agxbuf.h>
  33. #include <util/startswith.h>
  34. // #define YY_BUF_SIZE 128000
  35. #define GRAPH_EOF_TOKEN '@' /* lex class must be defined below */
  36. /* this is a workaround for linux flex */
  37. static int line_num = 1; /* current input line; bumped by the newline rules, printed in diagnostics */
  38. static int html_nest = 0; /* nesting level for html strings */
  39. static const char* InputFile; /* file name printed in diagnostics; may be NULL; set by agsetfile() or a line directive */
  40. static Agdisc_t *Disc; /* discipline whose io->afread() YY_INPUT reads through; set by aglexinit() */
  41. static void *Ifile; /* input handle passed as first argument to afread(); set by aglexinit() */
  42. static int graphType; /* 0 after aglexinit(), then T_graph or T_digraph once that keyword is scanned */
  43. /* By default, Flex calls isatty() to determine whether the input it is
  44. * scanning is coming from the user typing or from a file. However, our input
  45. * is being provided by Graphviz' I/O channel mechanism, which does not have a
  46. * valid file descriptor that supports isatty().
  47. */
  48. #define isatty(x) gv_isatty_suppression
  49. int gv_isatty_suppression; /* never assigned in this file, so it keeps its zero initial value */
  50. #ifndef YY_INPUT /* default reader: fill buf through the I/O discipline, abort the scanner on a negative result */
  51. #define YY_INPUT(buf,result,max_size) \
  52. if ((result = Disc->io->afread(Ifile, buf, max_size)) < 0) \
  53. YY_FATAL_ERROR( "input in flex scanner failed" )
  54. #endif
  55. /* buffer for arbitrary length strings (longer than BUFSIZ) */
  56. static agxbuf Sbuf;
  57. static void beginstr(void); /* asserts that Sbuf is empty before a new string starts */
  58. static void addstr(char *src); /* appends src to Sbuf */
  59. static void endstr(void); /* stores the Sbuf contents in aaglval.str via agstrdup() */
  60. static void endstr_html(void); /* same as endstr() but via agstrdup_html() */
  61. static void storeFileName(char* fname, size_t len); /* copies fname into a private buffer and points InputFile at it */
  62. /* ppDirective:
  63. * Process a possible preprocessor line directive.
  64. * aagtext = #.*
  65. */
  66. static void ppDirective (void);
  67. /* twoDots:
  68. * Return true if token has more than one '.';
  69. * we know the last character is a '.'.
  70. */
  71. static bool twoDots(void);
  72. /* chkNum:
  73. * The regexp for NUMBER allows a terminating letter or '.'.
  74. * This way we can catch a number immediately followed by a name
  75. * or something like 123.456.78, and report this to the user.
  76. */
  77. static int chkNum(void);
  78. /* The LETTER class below consists of ASCII letters, underscore, and all
  79. * non-ASCII characters. This allows identifiers to have characters from any
  80. * character set independent of locale. The downside is that, for certain
  81. * character sets, non-letter and, in fact, undefined characters will be
  82. * accepted. This is not likely and, from dot's standpoint, shouldn't do any
  83. * harm. (Presumably undefined characters will be ignored in display.) And,
  84. * it allows a greater wealth of names. */
  85. %}
  86. GRAPH_EOF_TOKEN [@]
  87. LETTER [A-Za-z_\200-\377]
  88. DIGIT [0-9]
  89. NAME {LETTER}({LETTER}|{DIGIT})*
  90. NUMBER [-]?(({DIGIT}+(\.{DIGIT}*)?)|(\.{DIGIT}+))(\.|{LETTER})?
  91. ID ({NAME}|{NUMBER})
  92. %x comment
  93. %x qstring
  94. %x hstring
  95. %%
  96. {GRAPH_EOF_TOKEN} return(EOF); /* marker character pushed back by aglexeof(); handed to the parser as EOF */
  97. <INITIAL,comment>\n line_num++; /* newlines inside strings are counted by the string rules further down */
  98. "/*" BEGIN(comment);
  99. <comment>[^*\n]* /* eat anything not a '*' */
  100. <comment>"*"+[^*/\n]* /* eat up '*'s not followed by '/'s */
  101. <comment>"*"+"/" BEGIN(INITIAL);
  102. "//".* /* ignore C++-style comments */
  103. ^"#".* ppDirective (); /* hash at start of line: possibly a preprocessor line directive */
  104. "#".* /* ignore shell-like comments */
  105. [ \t\r] /* ignore whitespace */
  106. "\xEF\xBB\xBF" /* ignore BOM */
  107. "node" return(T_node); /* see tokens in agcanonstr */
  108. "edge" return(T_edge);
  109. "graph" if (!graphType) graphType = T_graph; return(T_graph); /* only the first graph or digraph keyword sets graphType */
  110. "digraph" if (!graphType) graphType = T_digraph; return(T_digraph); /* only the first graph or digraph keyword sets graphType */
  111. "strict" return(T_strict);
  112. "subgraph" return(T_subgraph);
  113. "->" if (graphType == T_digraph) return(T_edgeop); else return('-'); /* an edge operator only when graphType is T_digraph */
  114. "--" if (graphType == T_graph) return(T_edgeop); else return('-'); /* an edge operator only when graphType is T_graph */
  115. {NAME} { aaglval.str = agstrdup(Ag_G_global,aagget_text()); return(T_atom); }
  116. {NUMBER} { if (chkNum()) yyless(aagget_leng()-1); aaglval.str = agstrdup(Ag_G_global,aagget_text()); return(T_atom); } /* nonzero chkNum(): give the final character back to the input so it is rescanned */
  117. ["] BEGIN(qstring); beginstr();
  118. <qstring>["] BEGIN(INITIAL); endstr(); return (T_qatom);
  119. <qstring>[\\]["] addstr ("\"");
  120. <qstring>[\\][\\] addstr ("\\\\");
  121. <qstring>[\\][\n] line_num++; /* ignore escaped newlines */
  122. <qstring>[\n] addstr ("\n"); line_num++;
  123. <qstring>([^"\\\n]*|[\\]) addstr(aagget_text());
  124. [<] BEGIN(hstring); html_nest = 1; beginstr(); /* the opening bracket itself is not stored */
  125. <hstring>[>] html_nest--; if (html_nest) addstr(aagget_text()); else {BEGIN(INITIAL); endstr_html(); return (T_qatom);} /* the bracket that brings html_nest to 0 ends the string and is not stored */
  126. <hstring>[<] html_nest++; addstr(aagget_text()); /* nested brackets are kept in the string */
  127. <hstring>[\n] addstr(aagget_text()); line_num++; /* add newlines */
  128. <hstring>([^><\n]*) addstr(aagget_text());
  129. . return aagget_text()[0]; /* any other single character is returned as its own token */
  130. %%
  131. void aagerror(const char *str);
  132. void aagerror(const char *str)
  133. {
  134. agxbuf xb = {0};
  135. if (InputFile) {
  136. agxbprint (&xb, "%s: ", InputFile);
  137. }
  138. agxbprint (&xb, "%s in line %d", str, line_num);
  139. if (*aagget_text()) {
  140. agxbprint(&xb, " near '%s'", aagget_text());
  141. }
  142. else switch (YYSTATE) {
  143. case qstring: {
  144. agxbprint(&xb, " scanning a quoted string (missing endquote? longer than %d?)", YY_BUF_SIZE);
  145. if (agxblen(&Sbuf) > 0) {
  146. agxbprint(&xb, "\nString starting:\"%.80s", agxbuse(&Sbuf));
  147. }
  148. break;
  149. }
  150. case hstring: {
  151. agxbprint(&xb, " scanning a HTML string (missing '>'? bad nesting? longer than %d?)", YY_BUF_SIZE);
  152. if (agxblen(&Sbuf) > 0) {
  153. agxbprint(&xb, "\nString starting:<%.80s", agxbuse(&Sbuf));
  154. }
  155. break;
  156. }
  157. case comment :
  158. agxbprint(&xb, " scanning a /*...*/ comment (missing '*/? longer than %d?)", YY_BUF_SIZE);
  159. break;
  160. default: // nothing extra to note
  161. break;
  162. }
  163. agxbputc (&xb, '\n');
  164. agerrorf("%s", agxbuse(&xb));
  165. agxbfree(&xb);
  166. BEGIN(INITIAL);
  167. }
  168. /* must be here to see flex's macro defns */
  169. void aglexeof(void) { unput(GRAPH_EOF_TOKEN); }
  170. void aglexbad(void) { YY_FLUSH_BUFFER; }
  #ifndef YY_CALL_ONLY_ARG
  # define YY_CALL_ONLY_ARG void
  #endif
  /* aagwrap:
   * End-of-input hook called by the generated scanner. Always returns 1;
   * flex treats a nonzero result as "there is no further input".
   */
  int aagwrap(YY_CALL_ONLY_ARG)
  {
      enum { no_more_input = 1 };
      return no_more_input;
  }
  178. /* Reset line number */
  179. void agreadline(int n) { line_num = n; }
  180. /* (Re)set file:
  181. */
  182. void agsetfile(const char* f) { InputFile = f; line_num = 1; }
  183. /* There is a hole here, because switching channels
  184. * requires pushing back whatever was previously read.
  185. * There probably is a right way of doing this.
  186. */
  187. void aglexinit(Agdisc_t *disc, void *ifile) { Disc = disc; Ifile = ifile; graphType = 0;}
  188. static void beginstr(void) {
  189. // nothing required, but we should not have pending string data
  190. assert(agxblen(&Sbuf) == 0 &&
  191. "pending string data that was not consumed (missing "
  192. "endstr()/endhtmlstr()?)");
  193. }
  194. static void addstr(char *src) {
  195. agxbput(&Sbuf, src);
  196. }
  197. static void endstr(void) {
  198. aaglval.str = agstrdup(Ag_G_global, agxbuse(&Sbuf));
  199. }
  200. static void endstr_html(void) {
  201. aaglval.str = agstrdup_html(Ag_G_global, agxbuse(&Sbuf));
  202. }
  203. static void storeFileName(char* fname, size_t len) {
  204. static size_t cnt;
  205. static char* buf;
  206. if (len > cnt) {
  207. buf = gv_realloc(buf, cnt + 1, len + 1);
  208. cnt = len;
  209. }
  210. strcpy (buf, fname);
  211. InputFile = buf;
  212. }
  213. /* ppDirective:
  214. * Process a possible preprocessor line directive.
  215. * aagtext = #.*
  216. */
  217. static void ppDirective (void)
  218. {
  219. int r, cnt, lineno;
  220. char buf[2];
  221. char* s = aagget_text() + 1; /* skip initial # */
  222. if (startswith(s, "line")) s += strlen("line");
  223. r = sscanf(s, "%d %1[\"]%n", &lineno, buf, &cnt);
  224. if (r > 0) { /* got line number */
  225. // ignore if line number was out of range
  226. if (lineno <= 0) {
  227. return;
  228. }
  229. line_num = lineno - 1;
  230. if (r > 1) { /* saw quote */
  231. char* p = s + cnt;
  232. char* e = p;
  233. while (*e && *e != '"') e++;
  234. if (e != p && *e == '"') {
  235. *e = '\0';
  236. storeFileName(p, (size_t)(e - p));
  237. }
  238. }
  239. }
  240. }
  /* twoDots:
   * Tell whether the current token contains a '.' somewhere before its final
   * character. The caller only asks when the final character is itself a
   * '.', so a true result means the token has more than one '.'.
   */
  static bool twoDots(void) {
      const char *const text = aagget_text();
      const char *const last = &text[aagget_leng() - 1];
      const char *const first_dot = strchr(text, '.');

      if (first_dot == NULL) {
          return false;
      }
      return first_dot != last;
  }
  250. /* chkNum:
  251. * The regexp for NUMBER allows a terminating letter or '.'.
  252. * This way we can catch a number immediately followed by a name
  253. * or something like 123.456.78, and report this to the user.
  254. */
  255. static int chkNum(void) {
  256. char c = aagget_text()[aagget_leng() - 1]; // last character
  257. if ((!gv_isdigit(c) && c != '.') || (c == '.' && twoDots())) { // c is letter
  258. const char* fname;
  259. if (InputFile)
  260. fname = InputFile;
  261. else
  262. fname = "input";
  263. agwarningf("syntax ambiguity - badly delimited number '%s' in line %d of "
  264. "%s splits into two tokens\n", aagget_text(), line_num, fname);
  265. return 1;
  266. }
  267. else return 0;
  268. }