regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000 by Florian Klaempfl
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. {
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - | support
  17. - getting substrings and using substrings with \1 etc.
  18. - test ^ and $
  19. - newline handling in DOS?
  20. - locals dependend upper/lowercase routines
  21. - extend the interface
  22. }
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. {$ifdef DEBUG}
  66. procedure writecharset(c : tcharset);
  67. var
  68. b : byte;
  69. begin
  70. for b:=0 to 255 do
  71. if chr(b) in c then
  72. write(chr(b));
  73. writeln;
  74. end;
  75. {$endif DEBUG}
  76. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  77. var
  78. first : pregexprentry;
  79. procedure doregister(p : pregexprentry);
  80. begin
  81. p^.nextdestroy:=first;
  82. if not(assigned(first)) then
  83. first:=p;
  84. end;
  85. var
  86. currentpos : pchar;
  87. error : boolean;
  88. function readchars : tcharset;
  89. var
  90. c1 : char;
  91. begin
  92. readchars:=[];
  93. case currentpos^ of
  94. #0:
  95. exit;
  96. '.':
  97. begin
  98. inc(currentpos);
  99. readchars:=cs_allchars-cs_newline;
  100. end;
  101. '\':
  102. begin
  103. inc(currentpos);
  104. case currentpos^ of
  105. #0:
  106. begin
  107. error:=true;
  108. exit;
  109. end;
  110. 't':
  111. begin
  112. inc(currentpos);
  113. readchars:=[#9];
  114. end;
  115. 'n':
  116. begin
  117. inc(currentpos);
  118. readchars:=[#10];
  119. end;
  120. 'r':
  121. begin
  122. inc(currentpos);
  123. readchars:=[#13];
  124. end;
  125. 'd':
  126. begin
  127. inc(currentpos);
  128. readchars:=cs_digits;
  129. end;
  130. 'D':
  131. begin
  132. inc(currentpos);
  133. readchars:=cs_nondigits;
  134. end;
  135. 's':
  136. begin
  137. inc(currentpos);
  138. readchars:=cs_whitespace;
  139. end;
  140. 'S':
  141. begin
  142. inc(currentpos);
  143. readchars:=cs_nonwhitespace;
  144. end;
  145. 'w':
  146. begin
  147. inc(currentpos);
  148. readchars:=cs_wordchars;
  149. end;
  150. 'W':
  151. begin
  152. inc(currentpos);
  153. readchars:=cs_nonwordchars;
  154. end;
  155. else
  156. begin
  157. error:=true;
  158. exit;
  159. end;
  160. end;
  161. end;
  162. else
  163. begin
  164. if ref_caseinsensitive in flags then
  165. c1:=upcase(currentpos^)
  166. else
  167. c1:=currentpos^;
  168. inc(currentpos);
  169. if currentpos^='-' then
  170. begin
  171. inc(currentpos);
  172. if currentpos^=#0 then
  173. begin
  174. error:=true;
  175. exit;
  176. end;
  177. if ref_caseinsensitive in flags then
  178. readchars:=[c1..upcase(currentpos^)]
  179. else
  180. readchars:=[c1..currentpos^];
  181. inc(currentpos);
  182. end
  183. else
  184. readchars:=[c1];
  185. end;
  186. end;
  187. end;
  188. function readcharset : tcharset;
  189. begin
  190. readcharset:=[];
  191. case currentpos^ of
  192. #0:
  193. exit;
  194. '[':
  195. begin
  196. inc(currentpos);
  197. while currentpos^<>']' do
  198. begin
  199. if currentpos^='^' then
  200. begin
  201. inc(currentpos);
  202. readcharset:=readcharset+(cs_allchars-readchars);
  203. end
  204. else
  205. readcharset:=readcharset+readchars;
  206. if error or (currentpos^=#0) then
  207. begin
  208. error:=true;
  209. exit;
  210. end;
  211. end;
  212. inc(currentpos);
  213. end;
  214. '^':
  215. begin
  216. inc(currentpos);
  217. readcharset:=cs_allchars-readchars;
  218. end;
  219. else
  220. readcharset:=readchars;
  221. end;
  222. end;
  223. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  224. var
  225. hp,hp2,ep : pregexprentry;
  226. cs : tcharset;
  227. chaining : ^pregexprentry;
  228. begin
  229. chaining:=nil;
  230. parseregexpr:=nil;
  231. if error then
  232. exit;
  233. { this dummy allows us to redirect the elsepath later }
  234. new(ep);
  235. doregister(ep);
  236. ep^.typ:=ret_charset;
  237. ep^.chars:=[];
  238. ep^.elsepath:=elsepath;
  239. elsepath:=ep;
  240. while true do
  241. begin
  242. if error then
  243. exit;
  244. case currentpos^ of
  245. '(':
  246. begin
  247. inc(currentpos);
  248. new(hp2);
  249. doregister(hp2);
  250. hp2^.typ:=ret_charset;
  251. hp2^.chars:=[];
  252. hp2^.elsepath:=next;
  253. hp:=parseregexpr(hp2,ep);
  254. if assigned(chaining) then
  255. chaining^:=hp
  256. else
  257. parseregexpr:=hp;
  258. chaining:=@hp2^.elsepath;
  259. if currentpos^<>')' then
  260. begin
  261. error:=true;
  262. exit;
  263. end;
  264. inc(currentpos);
  265. end;
  266. '|':
  267. begin
  268. {$ifdef DEBUG}
  269. writeln('Creating backtrace entry');
  270. {$endif DEBUG}
  271. inc(currentpos);
  272. if currentpos^=#0 then
  273. begin
  274. error:=true;
  275. exit;
  276. end;
  277. new(hp);
  278. doregister(hp);
  279. hp^.typ:=ret_backtrace;
  280. // hp^.elsepath:=parseregexpr(elsepath);
  281. hp^.next:=@parseregexpr;
  282. parseregexpr:=hp;
  283. exit;
  284. end;
  285. ')':
  286. exit;
  287. '^':
  288. begin
  289. inc(currentpos);
  290. new(hp);
  291. doregister(hp);
  292. hp^.typ:=ret_startline;
  293. hp^.elsepath:=ep;
  294. // hp^.next:=parseregexpr(ep);
  295. end;
  296. '$':
  297. begin
  298. inc(currentpos);
  299. new(hp);
  300. doregister(hp);
  301. hp^.typ:=ret_endline;
  302. hp^.elsepath:=ep;
  303. // hp^.next:=parseregexpr(ep);
  304. end;
  305. #0:
  306. exit;
  307. else
  308. begin
  309. cs:=readcharset;
  310. if error then
  311. exit;
  312. case currentpos^ of
  313. '*':
  314. begin
  315. inc(currentpos);
  316. new(hp);
  317. doregister(hp);
  318. hp^.typ:=ret_charset;
  319. hp^.chars:=cs;
  320. hp^.elsepath:=next;
  321. hp^.next:=hp;
  322. if assigned(chaining) then
  323. chaining^:=hp
  324. else
  325. parseregexpr:=hp;
  326. chaining:=@hp^.elsepath;
  327. end;
  328. '+':
  329. begin
  330. inc(currentpos);
  331. new(hp);
  332. new(hp2);
  333. doregister(hp);
  334. doregister(hp2);
  335. hp^.typ:=ret_charset;
  336. hp2^.typ:=ret_charset;
  337. hp^.chars:=cs;
  338. hp2^.chars:=cs;
  339. hp^.elsepath:=elsepath;
  340. hp^.next:=hp2;
  341. hp2^.elsepath:=next;
  342. hp2^.next:=hp2;
  343. if assigned(chaining) then
  344. chaining^:=hp
  345. else
  346. parseregexpr:=hp;
  347. chaining:=@hp2^.elsepath;
  348. end;
  349. '?':
  350. begin
  351. inc(currentpos);
  352. new(hp);
  353. { this is a dummy }
  354. new(hp2);
  355. doregister(hp);
  356. doregister(hp2);
  357. hp^.typ:=ret_charset;
  358. hp^.chars:=cs;
  359. hp^.next:=hp2;
  360. hp^.elsepath:=hp2;
  361. hp2^.typ:=ret_charset;
  362. hp2^.chars:=[];
  363. hp2^.elsepath:=next;
  364. if assigned(chaining) then
  365. chaining^:=hp
  366. else
  367. parseregexpr:=hp;
  368. chaining:=@hp2^.elsepath;
  369. end;
  370. else
  371. begin
  372. new(hp);
  373. doregister(hp);
  374. hp^.typ:=ret_charset;
  375. hp^.chars:=cs;
  376. hp^.elsepath:=elsepath;
  377. hp^.next:=next;
  378. if assigned(chaining) then
  379. chaining^:=hp
  380. else
  381. parseregexpr:=hp;
  382. chaining:=@hp^.next;
  383. end;
  384. end;
  385. end;
  386. end;
  387. end;
  388. end;
  389. var
  390. endp : pregexprentry;
  391. begin
  392. GenerateRegExprEngine.Data:=nil;
  393. GenerateRegExprEngine.DestroyList:=nil;
  394. if regexpr=nil then
  395. exit;
  396. first:=nil;
  397. if (ref_singleline in flags) and (ref_multiline in flags) then
  398. exit;
  399. currentpos:=regexpr;
  400. new(endp);
  401. doregister(endp);
  402. endp^.typ:=ret_illegalend;
  403. GenerateRegExprEngine.flags:=flags;
  404. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  405. GenerateRegExprEngine.DestroyList:=first;
  406. if error or (currentpos^<>#0) then
  407. DestroyRegExprEngine(Result);
  408. end;
  409. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  410. var
  411. hp : pregexprentry;
  412. begin
  413. hp:=regexpr.DestroyList;
  414. while assigned(hp) do
  415. begin
  416. regexpr.DestroyList:=hp^.nextdestroy;
  417. dispose(hp);
  418. hp:=regexpr.DestroyList;
  419. end;
  420. regexpr.Data:=nil;
  421. regexpr.DestroyList:=nil;
  422. end;
  423. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  424. var
  425. lastpos : pchar;
  426. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  427. begin
  428. dosearch:=false;
  429. while true do
  430. begin
  431. {$IFDEF Debug}
  432. writeln(byte(regexpr^.typ));
  433. {$ENDIF Debug}
  434. case regexpr^.typ of
  435. ret_endline:
  436. begin
  437. if ref_multiline in regexprengine.flags then
  438. begin
  439. if ((pos+1)^ in [#10,#0]) then
  440. regexpr:=regexpr^.next
  441. else
  442. regexpr:=regexpr^.elsepath;
  443. end
  444. else
  445. begin
  446. if (pos+1)^=#0 then
  447. regexpr:=regexpr^.next
  448. else
  449. regexpr:=regexpr^.elsepath;
  450. end;
  451. end;
  452. ret_startline:
  453. begin
  454. if ref_multiline in regexprengine.flags then
  455. begin
  456. if (pos=p) or ((pos-1)^=#10) then
  457. regexpr:=regexpr^.next
  458. else
  459. regexpr:=regexpr^.elsepath;
  460. end
  461. else
  462. begin
  463. if pos=p then
  464. regexpr:=regexpr^.next
  465. else
  466. regexpr:=regexpr^.elsepath;
  467. end;
  468. end;
  469. ret_charset:
  470. begin
  471. if (pos^ in regexpr^.chars) or
  472. ((ref_caseinsensitive in regexprengine.flags) and
  473. (upcase(pos^) in regexpr^.chars)) then
  474. begin
  475. {$ifdef DEBUG}
  476. writeln('Found matching: ',pos^);
  477. {$endif DEBUG}
  478. regexpr:=regexpr^.next;
  479. inc(pos);
  480. end
  481. else
  482. begin
  483. {$ifdef DEBUG}
  484. writeln('Found unmatching: ',pos^);
  485. {$endif DEBUG}
  486. regexpr:=regexpr^.elsepath;
  487. end;
  488. end;
  489. ret_backtrace:
  490. begin
  491. {$ifdef DEBUG}
  492. writeln('Starting backtrace');
  493. {$endif DEBUG}
  494. if dosearch(regexpr^.next,pos) then
  495. begin
  496. dosearch:=true;
  497. exit;
  498. end
  499. else if dosearch(regexpr^.elsepath,pos) then
  500. begin
  501. dosearch:=true;
  502. exit;
  503. end
  504. else
  505. exit;
  506. end;
  507. end;
  508. lastpos:=pos;
  509. if regexpr=nil then
  510. begin
  511. dosearch:=true;
  512. exit;
  513. end;
  514. if regexpr^.typ=ret_illegalend then
  515. exit;
  516. if pos^=#0 then
  517. exit;
  518. end;
  519. end;
  520. begin
  521. RegExprPos:=false;
  522. index:=0;
  523. len:=0;
  524. if regexprengine.Data=nil then
  525. exit;
  526. while p^<>#0 do
  527. begin
  528. if dosearch(regexprengine.Data,p) then
  529. begin
  530. len:=lastpos-p;
  531. RegExprPos:=true;
  532. exit;
  533. end
  534. else
  535. begin
  536. inc(p);
  537. inc(index);
  538. end;
  539. end;
  540. index:=-1;
  541. end;
  542. begin
  543. cs_nonwordchars:=cs_allchars-cs_wordchars;
  544. cs_nondigits:=cs_allchars-cs_digits;
  545. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  546. end.
  547. {
  548. $Log$
  549. Revision 1.3 2000-07-30 14:58:04 sg
  550. * Added modifications by Markus Kaemmerer:
  551. - Unit now compiles with Delphi
  552. - Removed debug output when not compiled with -dDEBUG
  553. Revision 1.2 2000/07/13 11:33:31 michael
  554. + removed logs
  555. }