lexrules.pas 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629
  1. {
  2. Parser for Lex grammar rules.
  3. This module implements a parser for Lex grammar rules. It should
  4. probably be reimplemented using Lex and Yacc, but the irregular
  5. lexical structure of the Lex language makes that rather tedious,
  6. so I decided to use a conventional recursive-descent-parser
  7. instead.
  8. Copyright (c) 1990-92 Albert Graef <[email protected]>
  9. Copyright (C) 1996 Berend de Boer <[email protected]>
  10. This program is free software; you can redistribute it and/or modify
  11. it under the terms of the GNU General Public License as published by
  12. the Free Software Foundation; either version 2 of the License, or
  13. (at your option) any later version.
  14. This program is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. GNU General Public License for more details.
  18. You should have received a copy of the GNU General Public License
  19. along with this program; if not, write to the Free Software
  20. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  21. $Revision: 1.4 $
  22. $Modtime: 96-08-01 6:30 $
  23. $History: LEXRULES.PAS $
  24. *
  25. * ***************** Version 2 *****************
  26. * User: Berend Date: 96-10-10 Time: 21:16
  27. * Updated in $/Lex and Yacc/tply
  28. * Updated for protected mode, windows and Delphi 1.X and 2.X.
  29. }
  30. unit LexRules;
  31. interface
  32. uses LexBase, LexTable;
  33. procedure parse_rule ( rule_no : Integer );
  34. (* rule parser (rule_no=number of parsed rule) *)
  35. (* Return values of rule parser: *)
  36. var
  37. expr, stmt : String;
  38. (* expression and statement part of rule *)
  39. cf : Boolean;
  40. (* caret flag *)
  41. n_st : Integer;
  42. (* number of start states in prefix *)
  43. st : array [1..max_states] of Integer;
  44. (* start states *)
  45. r : RegExpr;
  46. (* parsed expression *)
  47. implementation
  48. uses LexMsgs;
  49. (* Scanner routines:
  50. The following routines provide access to the source line and handle
  51. macro substitutions. To perform macro substitution, an input buffer
  52. is maintained which contains the rest of the line to be parsed, plus
  53. any pending macro substitutions. The input buffer is organized as
  54. a stack onto which null-terminated replacement strings are pushed
  55. as macro substitutions are processed (the terminating null-character
  56. is used as an endmarker for macros, in order to keep track of the
  57. number of pending macro substitutions); characters are popped from the
  58. stack via calls to the get_char routine.
  59. In order to perform macro substitution, the scanner also has to
  60. maintain some state information to be able to determine when it
  61. is scanning quoted characters, strings or character classes (s.t.
  62. no macro substitution is performed in such cases).
  63. The scanner also keeps track of the current source line position in
  64. variable act_pos; if there are any macro substitutions on the stack,
  65. act_pos will point to the position of the original macro call in the
  66. source line. This is needed to give proper error diagnostics. *)
  67. const max_chars = 2048;
  68. var
  69. act_pos, bufptr : Integer;
  70. (* current position in source line and input stack pointer *)
  71. buf : array [1..max_chars] of Char;
  72. (* input buffer *)
  73. str_state, cclass_state, quote_state : Boolean;
  74. (* state information *)
  75. n_macros : Integer;
  76. (* number of macros currently on stack *)
  77. procedure mark_error ( msg : String; offset : Integer );
  78. (* mark error position (offset=offset of error position (to the left of
  79. act_pos) *)
  80. begin
  81. if n_macros=0 then
  82. error(msg, act_pos-offset)
  83. else
  84. error(msg+' in regular definition', act_pos)
  85. end(*mark_error*);
  86. procedure put_str(str : String);
  87. (* push str onto input stack *)
  88. var i : Integer;
  89. begin
  90. inc(bufptr, length(str));
  91. if bufptr>max_chars then fatal(macro_stack_overflow);
  92. for i := 1 to length(str) do
  93. buf[bufptr-i+1] := str[i];
  94. end(*put_str*);
  95. procedure init_scanner;
  96. (* initialize the scanner *)
  97. begin
  98. act_pos := 1; bufptr := 0;
  99. str_state := false; cclass_state := false; quote_state := false;
  100. n_macros := 0;
  101. put_str(line);
  102. end(*init_scanner*);
  103. function act_char : Char;
  104. (* current character (#0 if none) *)
  105. function push_macro : Boolean;
  106. (* check for macro call at current position in input buffer *)
  107. function scan_macro ( var name : String ) : Boolean;
  108. var i : Integer;
  109. begin
  110. if (bufptr>1) and
  111. (buf[bufptr]='{') and (buf[bufptr-1] in letters) then
  112. begin
  113. name := '{'; i := bufptr-1;
  114. while (i>0) and (buf[i] in alphanums) do
  115. begin
  116. name := name+buf[i];
  117. dec(i);
  118. end;
  119. if (i>0) and (buf[i]='}') then
  120. begin
  121. scan_macro := true;
  122. name := name+'}';
  123. bufptr := i-1;
  124. end
  125. else
  126. begin
  127. scan_macro := false;
  128. mark_error(syntax_error, -length(name));
  129. bufptr := i;
  130. end
  131. end
  132. else
  133. scan_macro := false
  134. end(*scan_macro*);
  135. var name : String;
  136. begin
  137. if scan_macro(name) then
  138. begin
  139. push_macro := true;
  140. {$ifdef fpc}
  141. with sym_table^[key(name, max_keys, @lookup, @entry)] do
  142. {$else}
  143. with sym_table^[key(name, max_keys, lookup, entry)] do
  144. {$endif}
  145. if sym_type=macro_sym then
  146. begin
  147. put_str(subst^+#0);
  148. inc(n_macros);
  149. end
  150. else
  151. mark_error(undefined_symbol, -1)
  152. end
  153. else
  154. push_macro := false
  155. end(*push_macro*);
  156. function pop_macro : Boolean;
  157. (* check for macro endmarker *)
  158. begin
  159. if (bufptr>0) and (buf[bufptr]=#0) then
  160. begin
  161. dec(bufptr);
  162. dec(n_macros);
  163. if n_macros=0 then act_pos := length(line)-bufptr+1;
  164. pop_macro := true;
  165. end
  166. else
  167. pop_macro := false
  168. end(*pop_macro*);
  169. begin
  170. if not (str_state or cclass_state or quote_state) then
  171. while push_macro do while pop_macro do ;
  172. if bufptr=0 then
  173. act_char := #0
  174. else
  175. begin
  176. while pop_macro do ;
  177. if (bufptr>0) then
  178. act_char := buf[bufptr]
  179. else
  180. act_char:=#0;
  181. end
  182. end(*act_char*);
  183. procedure get_char;
  184. (* get next character *)
  185. begin
  186. if bufptr>0 then
  187. begin
  188. case buf[bufptr] of
  189. '\' : quote_state := not quote_state;
  190. '"' : if quote_state then
  191. quote_state := false
  192. else if not cclass_state then
  193. str_state := not str_state;
  194. '[' : if quote_state then
  195. quote_state := false
  196. else if not str_state then
  197. cclass_state := true;
  198. ']' : if quote_state then
  199. quote_state := false
  200. else if not str_state then
  201. cclass_state := false;
  202. else quote_state := false;
  203. end;
  204. dec(bufptr);
  205. if n_macros=0 then
  206. act_pos := length(line)-bufptr+1;
  207. end
  208. end(*get_char*);
  209. (* Semantic routines: *)
  210. procedure add_start_state ( symbol : String );
  211. (* add start state to st array *)
  212. begin
  213. {$ifdef fpc}
  214. with sym_table^[key(symbol, max_keys, @lookup, @entry)] do
  215. {$else}
  216. with sym_table^[key(symbol, max_keys, lookup, entry)] do
  217. {$endif}
  218. if sym_type=start_state_sym then
  219. begin
  220. if n_st>=max_start_states then exit; { this shouldn't happen }
  221. inc(n_st);
  222. st[n_st] := start_state;
  223. end
  224. else
  225. mark_error(undefined_symbol, length(symbol))
  226. end(*add_start_state*);
  227. (* Parser: *)
  228. procedure parse_rule ( rule_no : Integer );
  229. procedure rule ( var done : Boolean );
  230. (* parse rule according to syntax:
  231. rule : start_state_prefix caret
  232. expr [ '$' | '/' expr ]
  233. ;
  234. start_state_prefix : /* empty */
  235. | '<' start_state_list '>'
  236. ;
  237. start_state_list : ident { ',' ident }
  238. ;
  239. caret : /* empty */
  240. | '^'
  241. ;
  242. expr : term { '|' term }
  243. ;
  244. term : factor { factor }
  245. ;
  246. factor : char
  247. | string
  248. | cclass
  249. | '.'
  250. | '(' expr ')'
  251. | factor '*'
  252. | factor '+'
  253. | factor '?'
  254. | factor '{' num [ ',' num ] '}'
  255. ;
  256. *)
  257. procedure start_state_prefix ( var done : Boolean );
  258. procedure start_state_list ( var done : Boolean );
  259. procedure ident ( var done : Boolean );
  260. var idstr : String;
  261. begin(*ident*)
  262. done := act_char in letters; if not done then exit;
  263. idstr := act_char;
  264. get_char;
  265. while act_char in alphanums do
  266. begin
  267. idstr := idstr+act_char;
  268. get_char;
  269. end;
  270. add_start_state(idstr);
  271. end(*ident*);
  272. begin(*start_state_list*)
  273. ident(done); if not done then exit;
  274. while act_char=',' do
  275. begin
  276. get_char;
  277. ident(done); if not done then exit;
  278. end;
  279. end(*start_state_list*);
  280. begin(*start_state_prefix*)
  281. n_st := 0;
  282. if act_char='<' then
  283. begin
  284. get_char;
  285. start_state_list(done); if not done then exit;
  286. if act_char='>' then
  287. begin
  288. done := true;
  289. get_char;
  290. end
  291. else
  292. done := false
  293. end
  294. else
  295. done := true
  296. end(*start_state_prefix*);
  297. procedure caret( var done : Boolean );
  298. begin(*caret*)
  299. done := true;
  300. cf := act_char='^';
  301. if act_char='^' then get_char;
  302. end(*caret*);
  303. procedure scan_char ( var done : Boolean; var c : Char );
  304. var
  305. oct_val : Byte;
  306. count : Integer;
  307. begin
  308. done := true;
  309. if act_char='\' then
  310. begin
  311. get_char;
  312. case act_char of
  313. #0 : done := false;
  314. 'n' : begin
  315. c := nl;
  316. get_char
  317. end;
  318. 'r' : begin
  319. c := cr;
  320. get_char
  321. end;
  322. 't' : begin
  323. c := tab;
  324. get_char
  325. end;
  326. 'b' : begin
  327. c := bs;
  328. get_char
  329. end;
  330. 'f' : begin
  331. c := ff;
  332. get_char
  333. end;
  334. '0'..'7' : begin
  335. oct_val := ord(act_char)-ord('0');
  336. get_char;
  337. count := 1;
  338. while ('0'<=act_char) and
  339. (act_char<='7') and
  340. (count<3) do
  341. begin
  342. inc(count);
  343. oct_val := oct_val*8+ord(act_char)-ord('0');
  344. get_char
  345. end;
  346. c := chr(oct_val);
  347. end
  348. else begin
  349. c := act_char;
  350. get_char
  351. end
  352. end
  353. end
  354. else
  355. begin
  356. c := act_char;
  357. get_char
  358. end
  359. end(*scan_char*);
  360. procedure scan_str ( var done : Boolean; var str : String );
  361. var c : Char;
  362. begin
  363. str := '';
  364. get_char;
  365. while (act_char<>#0) and (act_char<>'"') do
  366. begin
  367. scan_char(done, c); if not done then exit;
  368. str := str+c;
  369. end;
  370. if act_char=#0 then
  371. done := false
  372. else
  373. begin
  374. get_char;
  375. done := true;
  376. end
  377. end(*scan_str*);
  378. procedure scan_cclass( var done : Boolean; var cc : CClass );
  379. (* scan a character class *)
  380. var
  381. caret : boolean;
  382. c, c1,cl : Char;
  383. begin
  384. cc := [];
  385. get_char;
  386. if act_char='^' then
  387. begin
  388. caret := true;
  389. get_char;
  390. end
  391. else
  392. caret := false;
  393. while (act_char<>#0) and (act_char<>']') do
  394. begin
  395. scan_char(done, c); if not done then exit;
  396. if act_char='-' then
  397. begin
  398. get_char;
  399. if (act_char<>#0) and (act_char<>']') then
  400. begin
  401. scan_char(done, c1); if not done then exit;
  402. for cl:=c to c1 do
  403. cc:=cc+[cl];
  404. {cc := cc+[c..c1];}
  405. end
  406. else
  407. cc := cc+[c,'-'];
  408. end
  409. else
  410. cc := cc+[c];
  411. end;
  412. if act_char=#0 then
  413. done := false
  414. else
  415. begin
  416. get_char;
  417. done := true;
  418. end;
  419. if caret then cc := [#1..#255]-cc;
  420. end(*scan_cclass*);
  421. procedure scan_num( var done : Boolean; var n : Integer );
  422. var str : String;
  423. begin
  424. if act_char in digits then
  425. begin
  426. str := act_char;
  427. get_char;
  428. while act_char in digits do
  429. begin
  430. str := str+act_char;
  431. get_char;
  432. end;
  433. done := isInt(str, n);
  434. end
  435. else
  436. done := false
  437. end(*scan_num*);
  438. procedure DoExpr ( var done : Boolean; var r : RegExpr );
  439. procedure term ( var done : Boolean; var r : RegExpr );
  440. procedure factor ( var done : Boolean; var r : RegExpr );
  441. var str : String;
  442. cc : CClass;
  443. c : Char;
  444. n, m : Integer;
  445. begin(*factor*)
  446. case act_char of
  447. '"' : begin
  448. scan_str(done, str); if not done then exit;
  449. r := strExpr(newStr(str));
  450. end;
  451. '[' : begin
  452. scan_cclass(done, cc); if not done then exit;
  453. r := cclassExpr(newCClass(cc));
  454. end;
  455. '.' : begin
  456. get_char;
  457. r := cclassExpr(newCClass([#1..#255]-[nl]));
  458. done := true;
  459. end;
  460. '(' : begin
  461. get_char;
  462. DoExpr(done, r); if not done then exit;
  463. if act_char=')' then
  464. begin
  465. get_char;
  466. done := true;
  467. end
  468. else
  469. done := false
  470. end;
  471. else begin
  472. scan_char(done, c); if not done then exit;
  473. r := charExpr(c);
  474. end;
  475. end;
  476. while done and (act_char in ['*','+','?','{']) do
  477. case act_char of
  478. '*' : begin
  479. get_char;
  480. r := starExpr(r);
  481. end;
  482. '+' : begin
  483. get_char;
  484. r := plusExpr(r);
  485. end;
  486. '?' : begin
  487. get_char;
  488. r := optExpr(r);
  489. end;
  490. '{' : begin
  491. get_char;
  492. scan_num(done, m); if not done then exit;
  493. if act_char=',' then
  494. begin
  495. get_char;
  496. scan_num(done, n); if not done then exit;
  497. r := mnExpr(r, m, n);
  498. end
  499. else
  500. r := mnExpr(r, m, m);
  501. if act_char='}' then
  502. begin
  503. get_char;
  504. done := true;
  505. end
  506. else
  507. done := false
  508. end;
  509. end
  510. end(*factor*);
  511. const term_delim : CClass = [#0,' ',tab,'$','|',')','/'];
  512. var r1 : RegExpr;
  513. begin(*term*)
  514. if not (act_char in term_delim) then
  515. begin
  516. factor(done, r); if not done then exit;
  517. while not (act_char in term_delim) do
  518. begin
  519. factor(done, r1); if not done then exit;
  520. r := catExpr(r, r1);
  521. end
  522. end
  523. else
  524. begin
  525. r := epsExpr;
  526. done := true;
  527. end
  528. end(*term*);
  529. var r1 : RegExpr;
  530. begin(*expr*)
  531. term(done, r); if not done then exit;
  532. while act_char='|' do
  533. begin
  534. get_char;
  535. term(done, r1); if not done then exit;
  536. r := altExpr(r, r1);
  537. end
  538. end(*expr*);
  539. var r1, r2 : RegExpr;
  540. begin(*rule*)
  541. start_state_prefix(done); if not done then exit;
  542. caret(done); if not done then exit;
  543. DoExpr(done, r1); if not done then exit;
  544. if act_char='$' then
  545. begin
  546. r := catExpr(catExpr(r1,
  547. markExpr(rule_no, 1)),
  548. cclassExpr(newCClass([nl])));
  549. get_char;
  550. end
  551. else if act_char='/' then
  552. begin
  553. get_char;
  554. DoExpr(done, r2); if not done then exit;
  555. r := catExpr(catExpr(r1,
  556. markExpr(rule_no, 1)), r2);
  557. end
  558. else
  559. r := catExpr(r1, markExpr(rule_no, 1));
  560. r := catExpr(r, markExpr(rule_no, 0));
  561. done := (act_char=#0) or (act_char=' ') or (act_char=tab);
  562. end(*rule*);
  563. var done : Boolean;
  564. begin(*parse_rule*)
  565. init_scanner;
  566. rule(done);
  567. if done then
  568. begin
  569. expr := copy(line, 1, act_pos-1);
  570. stmt := copy(line, act_pos, length(line));
  571. end
  572. else
  573. mark_error(syntax_error, 0)
  574. end(*parse_rule*);
  575. end(*LexRules*).