regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000 by Florian Klaempfl
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. {
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - | support
  17. - getting substrings and using substrings with \1 etc.
  18. - test ^ and $
  19. - newline handling in DOS?
  20. - locals dependend upper/lowercase routines
  21. - extend the interface
  22. }
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. uses
  66. strings;
  67. {$ifdef DEBUG}
  68. procedure writecharset(c : tcharset);
  69. var
  70. b : byte;
  71. begin
  72. for b:=0 to 255 do
  73. if chr(b) in c then
  74. write(chr(b));
  75. writeln;
  76. end;
  77. {$endif DEBUG}
  78. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  79. var
  80. first : pregexprentry;
  81. procedure doregister(p : pregexprentry);
  82. begin
  83. p^.nextdestroy:=first;
  84. if not(assigned(first)) then
  85. first:=p;
  86. end;
  87. var
  88. currentpos : pchar;
  89. error : boolean;
  90. function readchars : tcharset;
  91. var
  92. c1 : char;
  93. begin
  94. readchars:=[];
  95. case currentpos^ of
  96. #0:
  97. exit;
  98. '.':
  99. begin
  100. inc(currentpos);
  101. readchars:=cs_allchars-cs_newline;
  102. end;
  103. '\':
  104. begin
  105. inc(currentpos);
  106. case currentpos^ of
  107. #0:
  108. begin
  109. error:=true;
  110. exit;
  111. end;
  112. 't':
  113. begin
  114. inc(currentpos);
  115. readchars:=[#9];
  116. end;
  117. 'n':
  118. begin
  119. inc(currentpos);
  120. readchars:=[#10];
  121. end;
  122. 'r':
  123. begin
  124. inc(currentpos);
  125. readchars:=[#13];
  126. end;
  127. 'd':
  128. begin
  129. inc(currentpos);
  130. readchars:=cs_digits;
  131. end;
  132. 'D':
  133. begin
  134. inc(currentpos);
  135. readchars:=cs_nondigits;
  136. end;
  137. 's':
  138. begin
  139. inc(currentpos);
  140. readchars:=cs_whitespace;
  141. end;
  142. 'S':
  143. begin
  144. inc(currentpos);
  145. readchars:=cs_nonwhitespace;
  146. end;
  147. 'w':
  148. begin
  149. inc(currentpos);
  150. readchars:=cs_wordchars;
  151. end;
  152. 'W':
  153. begin
  154. inc(currentpos);
  155. readchars:=cs_nonwordchars;
  156. end;
  157. else
  158. begin
  159. error:=true;
  160. exit;
  161. end;
  162. end;
  163. end;
  164. else
  165. begin
  166. if ref_caseinsensitive in flags then
  167. c1:=upcase(currentpos^)
  168. else
  169. c1:=currentpos^;
  170. inc(currentpos);
  171. if currentpos^='-' then
  172. begin
  173. inc(currentpos);
  174. if currentpos^=#0 then
  175. begin
  176. error:=true;
  177. exit;
  178. end;
  179. if ref_caseinsensitive in flags then
  180. readchars:=[c1..upcase(currentpos^)]
  181. else
  182. readchars:=[c1..currentpos^];
  183. inc(currentpos);
  184. end
  185. else
  186. readchars:=[c1];
  187. end;
  188. end;
  189. end;
  190. function readcharset : tcharset;
  191. begin
  192. readcharset:=[];
  193. case currentpos^ of
  194. #0:
  195. exit;
  196. '[':
  197. begin
  198. inc(currentpos);
  199. while currentpos^<>']' do
  200. begin
  201. if currentpos^='^' then
  202. begin
  203. inc(currentpos);
  204. readcharset:=readcharset+(cs_allchars-readchars);
  205. end
  206. else
  207. readcharset:=readcharset+readchars;
  208. if error or (currentpos^=#0) then
  209. begin
  210. error:=true;
  211. exit;
  212. end;
  213. end;
  214. inc(currentpos);
  215. end;
  216. '^':
  217. begin
  218. inc(currentpos);
  219. readcharset:=cs_allchars-readchars;
  220. end;
  221. else
  222. readcharset:=readchars;
  223. end;
  224. end;
  225. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  226. var
  227. hp,hp2,ep : pregexprentry;
  228. cs : tcharset;
  229. chaining : ^pregexprentry;
  230. begin
  231. chaining:=nil;
  232. parseregexpr:=nil;
  233. if error then
  234. exit;
  235. { this dummy allows us to redirect the elsepath later }
  236. new(ep);
  237. doregister(ep);
  238. ep^.typ:=ret_charset;
  239. ep^.chars:=[];
  240. ep^.elsepath:=elsepath;
  241. elsepath:=ep;
  242. while true do
  243. begin
  244. if error then
  245. exit;
  246. case currentpos^ of
  247. '(':
  248. begin
  249. inc(currentpos);
  250. new(hp2);
  251. doregister(hp2);
  252. hp2^.typ:=ret_charset;
  253. hp2^.chars:=[];
  254. hp2^.elsepath:=next;
  255. hp:=parseregexpr(hp2,ep);
  256. if assigned(chaining) then
  257. chaining^:=hp
  258. else
  259. parseregexpr:=hp;
  260. chaining:=@hp2^.elsepath;
  261. if currentpos^<>')' then
  262. begin
  263. error:=true;
  264. exit;
  265. end;
  266. inc(currentpos);
  267. end;
  268. '|':
  269. begin
  270. {$ifdef DEBUG}
  271. writeln('Creating backtrace entry');
  272. {$endif DEBUG}
  273. inc(currentpos);
  274. if currentpos^=#0 then
  275. begin
  276. error:=true;
  277. exit;
  278. end;
  279. new(hp);
  280. doregister(hp);
  281. hp^.typ:=ret_backtrace;
  282. // hp^.elsepath:=parseregexpr(elsepath);
  283. hp^.next:=parseregexpr;
  284. parseregexpr:=hp;
  285. exit;
  286. end;
  287. ')':
  288. exit;
  289. '^':
  290. begin
  291. inc(currentpos);
  292. new(hp);
  293. doregister(hp);
  294. hp^.typ:=ret_startline;
  295. hp^.elsepath:=ep;
  296. // hp^.next:=parseregexpr(ep);
  297. end;
  298. '$':
  299. begin
  300. inc(currentpos);
  301. new(hp);
  302. doregister(hp);
  303. hp^.typ:=ret_endline;
  304. hp^.elsepath:=ep;
  305. // hp^.next:=parseregexpr(ep);
  306. end;
  307. #0:
  308. exit;
  309. else
  310. begin
  311. cs:=readcharset;
  312. if error then
  313. exit;
  314. case currentpos^ of
  315. '*':
  316. begin
  317. inc(currentpos);
  318. new(hp);
  319. doregister(hp);
  320. hp^.typ:=ret_charset;
  321. hp^.chars:=cs;
  322. hp^.elsepath:=next;
  323. hp^.next:=hp;
  324. if assigned(chaining) then
  325. chaining^:=hp
  326. else
  327. parseregexpr:=hp;
  328. chaining:=@hp^.elsepath;
  329. end;
  330. '+':
  331. begin
  332. inc(currentpos);
  333. new(hp);
  334. new(hp2);
  335. doregister(hp);
  336. doregister(hp2);
  337. hp^.typ:=ret_charset;
  338. hp2^.typ:=ret_charset;
  339. hp^.chars:=cs;
  340. hp2^.chars:=cs;
  341. hp^.elsepath:=elsepath;
  342. hp^.next:=hp2;
  343. hp2^.elsepath:=next;
  344. hp2^.next:=hp2;
  345. if assigned(chaining) then
  346. chaining^:=hp
  347. else
  348. parseregexpr:=hp;
  349. chaining:=@hp2^.elsepath;
  350. end;
  351. '?':
  352. begin
  353. inc(currentpos);
  354. new(hp);
  355. { this is a dummy }
  356. new(hp2);
  357. doregister(hp);
  358. doregister(hp2);
  359. hp^.typ:=ret_charset;
  360. hp^.chars:=cs;
  361. hp^.next:=hp2;
  362. hp^.elsepath:=hp2;
  363. hp2^.typ:=ret_charset;
  364. hp2^.chars:=[];
  365. hp2^.elsepath:=next;
  366. if assigned(chaining) then
  367. chaining^:=hp
  368. else
  369. parseregexpr:=hp;
  370. chaining:=@hp2^.elsepath;
  371. end;
  372. else
  373. begin
  374. new(hp);
  375. doregister(hp);
  376. hp^.typ:=ret_charset;
  377. hp^.chars:=cs;
  378. hp^.elsepath:=elsepath;
  379. hp^.next:=next;
  380. if assigned(chaining) then
  381. chaining^:=hp
  382. else
  383. parseregexpr:=hp;
  384. chaining:=@hp^.next;
  385. end;
  386. end;
  387. end;
  388. end;
  389. end;
  390. end;
  391. var
  392. endp : pregexprentry;
  393. begin
  394. GenerateRegExprEngine.Data:=nil;
  395. GenerateRegExprEngine.DestroyList:=nil;
  396. if regexpr=nil then
  397. exit;
  398. first:=nil;
  399. if (ref_singleline in flags) and (ref_multiline in flags) then
  400. exit;
  401. currentpos:=regexpr;
  402. new(endp);
  403. doregister(endp);
  404. endp^.typ:=ret_illegalend;
  405. GenerateRegExprEngine.flags:=flags;
  406. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  407. GenerateRegExprEngine.DestroyList:=first;
  408. if error or (currentpos^<>#0) then
  409. DestroyRegExprEngine(GenerateRegExprEngine);
  410. end;
  411. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  412. var
  413. hp : pregexprentry;
  414. begin
  415. hp:=regexpr.DestroyList;
  416. while assigned(hp) do
  417. begin
  418. regexpr.DestroyList:=hp^.nextdestroy;
  419. dispose(hp);
  420. hp:=regexpr.DestroyList;
  421. end;
  422. regexpr.Data:=nil;
  423. regexpr.DestroyList:=nil;
  424. end;
  425. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  426. var
  427. lastpos : pchar;
  428. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  429. begin
  430. dosearch:=false;
  431. while true do
  432. begin
  433. writeln(byte(regexpr^.typ));
  434. case regexpr^.typ of
  435. ret_endline:
  436. begin
  437. if ref_multiline in regexprengine.flags then
  438. begin
  439. if ((pos+1)^ in [#10,#0]) then
  440. regexpr:=regexpr^.next
  441. else
  442. regexpr:=regexpr^.elsepath;
  443. end
  444. else
  445. begin
  446. if (pos+1)^=#0 then
  447. regexpr:=regexpr^.next
  448. else
  449. regexpr:=regexpr^.elsepath;
  450. end;
  451. end;
  452. ret_startline:
  453. begin
  454. if ref_multiline in regexprengine.flags then
  455. begin
  456. if (pos=p) or ((pos-1)^=#10) then
  457. regexpr:=regexpr^.next
  458. else
  459. regexpr:=regexpr^.elsepath;
  460. end
  461. else
  462. begin
  463. if pos=p then
  464. regexpr:=regexpr^.next
  465. else
  466. regexpr:=regexpr^.elsepath;
  467. end;
  468. end;
  469. ret_charset:
  470. begin
  471. if (pos^ in regexpr^.chars) or
  472. ((ref_caseinsensitive in regexprengine.flags) and
  473. (upcase(pos^) in regexpr^.chars)) then
  474. begin
  475. {$ifdef DEBUG}
  476. writeln('Found matching: ',pos^);
  477. {$endif DEBUG}
  478. regexpr:=regexpr^.next;
  479. inc(pos);
  480. end
  481. else
  482. begin
  483. {$ifdef DEBUG}
  484. writeln('Found unmatching: ',pos^);
  485. {$endif DEBUG}
  486. regexpr:=regexpr^.elsepath;
  487. end;
  488. end;
  489. ret_backtrace:
  490. begin
  491. {$ifdef DEBUG}
  492. writeln('Starting backtrace');
  493. {$endif DEBUG}
  494. if dosearch(regexpr^.next,pos) then
  495. begin
  496. dosearch:=true;
  497. exit;
  498. end
  499. else if dosearch(regexpr^.elsepath,pos) then
  500. begin
  501. dosearch:=true;
  502. exit;
  503. end
  504. else
  505. exit;
  506. end;
  507. end;
  508. lastpos:=pos;
  509. if regexpr=nil then
  510. begin
  511. dosearch:=true;
  512. exit;
  513. end;
  514. if regexpr^.typ=ret_illegalend then
  515. exit;
  516. if pos^=#0 then
  517. exit;
  518. end;
  519. end;
  520. begin
  521. RegExprPos:=false;
  522. index:=0;
  523. len:=0;
  524. if regexprengine.Data=nil then
  525. exit;
  526. while p^<>#0 do
  527. begin
  528. if dosearch(regexprengine.Data,p) then
  529. begin
  530. len:=lastpos-p;
  531. RegExprPos:=true;
  532. exit;
  533. end
  534. else
  535. begin
  536. inc(p);
  537. inc(index);
  538. end;
  539. end;
  540. index:=-1;
  541. end;
  542. begin
  543. cs_nonwordchars:=cs_allchars-cs_wordchars;
  544. cs_nondigits:=cs_allchars-cs_digits;
  545. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  546. end.
  547. {
  548. $Log$
  549. Revision 1.4 2000-04-08 09:31:59 peter
  550. * makefiles added
  551. * removed notes
  552. Revision 1.3 2000/03/19 16:20:44 florian
  553. * some improvements
  554. Revision 1.2 2000/03/14 22:57:51 florian
  555. + added flags
  556. + support of case insensitive search
  557. Revision 1.1 2000/03/14 22:09:03 florian
  558. * Initial revision
  559. }