regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000 by Florian Klaempfl
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. {
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - | support
  17. - getting substrings and using substrings with \1 etc.
  18. - test ^ and $
  19. - newline handling in DOS?
  20. - locals dependend upper/lowercase routines
  21. - extend the interface
  22. }
  23. {$mode objfpc}
  24. unit regexpr;
  25. interface
  26. { the following declarions are only in the interface because }
  27. { some procedures return pregexprentry but programs which }
  28. { use this unit shouldn't access this data structures }
  29. type
  30. tcharset = set of char;
  31. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  32. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  33. ret_endline);
  34. pregexprentry = ^tregexprentry;
  35. tregexprentry = record
  36. next,nextdestroy : pregexprentry;
  37. case typ : tregexprentrytype of
  38. ret_charset : (chars : tcharset;
  39. elsepath : pregexprentry);
  40. ret_or : (alternative : pregexprentry);
  41. end;
  42. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  43. tregexprflags = set of tregexprflag;
  44. TRegExprEngine = record
  45. Data : pregexprentry;
  46. DestroyList : pregexprentry;
  47. Flags : TRegExprFlags;
  48. end;
  49. const
  50. cs_allchars : tcharset = [#0..#255];
  51. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  52. cs_newline : tcharset = [#10];
  53. cs_digits : tcharset = ['0'..'9'];
  54. cs_whitespace : tcharset = [' ',#9];
  55. var
  56. { these are initilized in the init section of the unit }
  57. cs_nonwordchars : tcharset;
  58. cs_nondigits : tcharset;
  59. cs_nonwhitespace : tcharset;
  60. { the following procedures can be used by units basing }
  61. { on the regexpr unit }
  62. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  63. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  64. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  65. implementation
  66. {$ifdef DEBUG}
  67. procedure writecharset(c : tcharset);
  68. var
  69. b : byte;
  70. begin
  71. for b:=0 to 255 do
  72. if chr(b) in c then
  73. write(chr(b));
  74. writeln;
  75. end;
  76. {$endif DEBUG}
  77. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  78. var
  79. first : pregexprentry;
  80. procedure doregister(p : pregexprentry);
  81. begin
  82. p^.nextdestroy:=first;
  83. if not(assigned(first)) then
  84. first:=p;
  85. end;
  86. var
  87. currentpos : pchar;
  88. error : boolean;
  89. function readchars : tcharset;
  90. var
  91. c1 : char;
  92. begin
  93. readchars:=[];
  94. case currentpos^ of
  95. #0:
  96. exit;
  97. '.':
  98. begin
  99. inc(currentpos);
  100. readchars:=cs_allchars-cs_newline;
  101. end;
  102. '\':
  103. begin
  104. inc(currentpos);
  105. case currentpos^ of
  106. #0:
  107. begin
  108. error:=true;
  109. exit;
  110. end;
  111. 't':
  112. begin
  113. inc(currentpos);
  114. readchars:=[#9];
  115. end;
  116. 'n':
  117. begin
  118. inc(currentpos);
  119. readchars:=[#10];
  120. end;
  121. 'r':
  122. begin
  123. inc(currentpos);
  124. readchars:=[#13];
  125. end;
  126. 'd':
  127. begin
  128. inc(currentpos);
  129. readchars:=cs_digits;
  130. end;
  131. 'D':
  132. begin
  133. inc(currentpos);
  134. readchars:=cs_nondigits;
  135. end;
  136. 's':
  137. begin
  138. inc(currentpos);
  139. readchars:=cs_whitespace;
  140. end;
  141. 'S':
  142. begin
  143. inc(currentpos);
  144. readchars:=cs_nonwhitespace;
  145. end;
  146. 'w':
  147. begin
  148. inc(currentpos);
  149. readchars:=cs_wordchars;
  150. end;
  151. 'W':
  152. begin
  153. inc(currentpos);
  154. readchars:=cs_nonwordchars;
  155. end;
  156. else
  157. begin
  158. error:=true;
  159. exit;
  160. end;
  161. end;
  162. end;
  163. else
  164. begin
  165. if ref_caseinsensitive in flags then
  166. c1:=upcase(currentpos^)
  167. else
  168. c1:=currentpos^;
  169. inc(currentpos);
  170. if currentpos^='-' then
  171. begin
  172. inc(currentpos);
  173. if currentpos^=#0 then
  174. begin
  175. error:=true;
  176. exit;
  177. end;
  178. if ref_caseinsensitive in flags then
  179. readchars:=[c1..upcase(currentpos^)]
  180. else
  181. readchars:=[c1..currentpos^];
  182. inc(currentpos);
  183. end
  184. else
  185. readchars:=[c1];
  186. end;
  187. end;
  188. end;
  189. function readcharset : tcharset;
  190. begin
  191. readcharset:=[];
  192. case currentpos^ of
  193. #0:
  194. exit;
  195. '[':
  196. begin
  197. inc(currentpos);
  198. while currentpos^<>']' do
  199. begin
  200. if currentpos^='^' then
  201. begin
  202. inc(currentpos);
  203. readcharset:=readcharset+(cs_allchars-readchars);
  204. end
  205. else
  206. readcharset:=readcharset+readchars;
  207. if error or (currentpos^=#0) then
  208. begin
  209. error:=true;
  210. exit;
  211. end;
  212. end;
  213. inc(currentpos);
  214. end;
  215. '^':
  216. begin
  217. inc(currentpos);
  218. readcharset:=cs_allchars-readchars;
  219. end;
  220. else
  221. readcharset:=readchars;
  222. end;
  223. end;
  224. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  225. var
  226. hp,hp2,ep : pregexprentry;
  227. cs : tcharset;
  228. chaining : ^pregexprentry;
  229. begin
  230. chaining:=nil;
  231. parseregexpr:=nil;
  232. if error then
  233. exit;
  234. { this dummy allows us to redirect the elsepath later }
  235. new(ep);
  236. doregister(ep);
  237. ep^.typ:=ret_charset;
  238. ep^.chars:=[];
  239. ep^.elsepath:=elsepath;
  240. elsepath:=ep;
  241. while true do
  242. begin
  243. if error then
  244. exit;
  245. case currentpos^ of
  246. '(':
  247. begin
  248. inc(currentpos);
  249. new(hp2);
  250. doregister(hp2);
  251. hp2^.typ:=ret_charset;
  252. hp2^.chars:=[];
  253. hp2^.elsepath:=next;
  254. hp:=parseregexpr(hp2,ep);
  255. if assigned(chaining) then
  256. chaining^:=hp
  257. else
  258. parseregexpr:=hp;
  259. chaining:=@hp2^.elsepath;
  260. if currentpos^<>')' then
  261. begin
  262. error:=true;
  263. exit;
  264. end;
  265. inc(currentpos);
  266. end;
  267. '|':
  268. begin
  269. {$ifdef DEBUG}
  270. writeln('Creating backtrace entry');
  271. {$endif DEBUG}
  272. inc(currentpos);
  273. if currentpos^=#0 then
  274. begin
  275. error:=true;
  276. exit;
  277. end;
  278. new(hp);
  279. doregister(hp);
  280. hp^.typ:=ret_backtrace;
  281. // hp^.elsepath:=parseregexpr(elsepath);
  282. hp^.next:=@parseregexpr;
  283. parseregexpr:=hp;
  284. exit;
  285. end;
  286. ')':
  287. exit;
  288. '^':
  289. begin
  290. inc(currentpos);
  291. new(hp);
  292. doregister(hp);
  293. hp^.typ:=ret_startline;
  294. hp^.elsepath:=ep;
  295. // hp^.next:=parseregexpr(ep);
  296. end;
  297. '$':
  298. begin
  299. inc(currentpos);
  300. new(hp);
  301. doregister(hp);
  302. hp^.typ:=ret_endline;
  303. hp^.elsepath:=ep;
  304. // hp^.next:=parseregexpr(ep);
  305. end;
  306. #0:
  307. exit;
  308. else
  309. begin
  310. cs:=readcharset;
  311. if error then
  312. exit;
  313. case currentpos^ of
  314. '*':
  315. begin
  316. inc(currentpos);
  317. new(hp);
  318. doregister(hp);
  319. hp^.typ:=ret_charset;
  320. hp^.chars:=cs;
  321. hp^.elsepath:=next;
  322. hp^.next:=hp;
  323. if assigned(chaining) then
  324. chaining^:=hp
  325. else
  326. parseregexpr:=hp;
  327. chaining:=@hp^.elsepath;
  328. end;
  329. '+':
  330. begin
  331. inc(currentpos);
  332. new(hp);
  333. new(hp2);
  334. doregister(hp);
  335. doregister(hp2);
  336. hp^.typ:=ret_charset;
  337. hp2^.typ:=ret_charset;
  338. hp^.chars:=cs;
  339. hp2^.chars:=cs;
  340. hp^.elsepath:=elsepath;
  341. hp^.next:=hp2;
  342. hp2^.elsepath:=next;
  343. hp2^.next:=hp2;
  344. if assigned(chaining) then
  345. chaining^:=hp
  346. else
  347. parseregexpr:=hp;
  348. chaining:=@hp2^.elsepath;
  349. end;
  350. '?':
  351. begin
  352. inc(currentpos);
  353. new(hp);
  354. { this is a dummy }
  355. new(hp2);
  356. doregister(hp);
  357. doregister(hp2);
  358. hp^.typ:=ret_charset;
  359. hp^.chars:=cs;
  360. hp^.next:=hp2;
  361. hp^.elsepath:=hp2;
  362. hp2^.typ:=ret_charset;
  363. hp2^.chars:=[];
  364. hp2^.elsepath:=next;
  365. if assigned(chaining) then
  366. chaining^:=hp
  367. else
  368. parseregexpr:=hp;
  369. chaining:=@hp2^.elsepath;
  370. end;
  371. else
  372. begin
  373. new(hp);
  374. doregister(hp);
  375. hp^.typ:=ret_charset;
  376. hp^.chars:=cs;
  377. hp^.elsepath:=elsepath;
  378. hp^.next:=next;
  379. if assigned(chaining) then
  380. chaining^:=hp
  381. else
  382. parseregexpr:=hp;
  383. chaining:=@hp^.next;
  384. end;
  385. end;
  386. end;
  387. end;
  388. end;
  389. end;
  390. var
  391. endp : pregexprentry;
  392. begin
  393. GenerateRegExprEngine.Data:=nil;
  394. GenerateRegExprEngine.DestroyList:=nil;
  395. if regexpr=nil then
  396. exit;
  397. first:=nil;
  398. if (ref_singleline in flags) and (ref_multiline in flags) then
  399. exit;
  400. currentpos:=regexpr;
  401. new(endp);
  402. doregister(endp);
  403. endp^.typ:=ret_illegalend;
  404. GenerateRegExprEngine.flags:=flags;
  405. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  406. GenerateRegExprEngine.DestroyList:=first;
  407. if error or (currentpos^<>#0) then
  408. DestroyRegExprEngine(Result);
  409. end;
  410. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  411. var
  412. hp : pregexprentry;
  413. begin
  414. hp:=regexpr.DestroyList;
  415. while assigned(hp) do
  416. begin
  417. regexpr.DestroyList:=hp^.nextdestroy;
  418. dispose(hp);
  419. hp:=regexpr.DestroyList;
  420. end;
  421. regexpr.Data:=nil;
  422. regexpr.DestroyList:=nil;
  423. end;
  424. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  425. var
  426. lastpos : pchar;
  427. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  428. begin
  429. dosearch:=false;
  430. while true do
  431. begin
  432. {$IFDEF Debug}
  433. writeln(byte(regexpr^.typ));
  434. {$ENDIF Debug}
  435. case regexpr^.typ of
  436. ret_endline:
  437. begin
  438. if ref_multiline in regexprengine.flags then
  439. begin
  440. if ((pos+1)^ in [#10,#0]) then
  441. regexpr:=regexpr^.next
  442. else
  443. regexpr:=regexpr^.elsepath;
  444. end
  445. else
  446. begin
  447. if (pos+1)^=#0 then
  448. regexpr:=regexpr^.next
  449. else
  450. regexpr:=regexpr^.elsepath;
  451. end;
  452. end;
  453. ret_startline:
  454. begin
  455. if ref_multiline in regexprengine.flags then
  456. begin
  457. if (pos=p) or ((pos-1)^=#10) then
  458. regexpr:=regexpr^.next
  459. else
  460. regexpr:=regexpr^.elsepath;
  461. end
  462. else
  463. begin
  464. if pos=p then
  465. regexpr:=regexpr^.next
  466. else
  467. regexpr:=regexpr^.elsepath;
  468. end;
  469. end;
  470. ret_charset:
  471. begin
  472. if (pos^ in regexpr^.chars) or
  473. ((ref_caseinsensitive in regexprengine.flags) and
  474. (upcase(pos^) in regexpr^.chars)) then
  475. begin
  476. {$ifdef DEBUG}
  477. writeln('Found matching: ',pos^);
  478. {$endif DEBUG}
  479. regexpr:=regexpr^.next;
  480. inc(pos);
  481. end
  482. else
  483. begin
  484. {$ifdef DEBUG}
  485. writeln('Found unmatching: ',pos^);
  486. {$endif DEBUG}
  487. regexpr:=regexpr^.elsepath;
  488. end;
  489. end;
  490. ret_backtrace:
  491. begin
  492. {$ifdef DEBUG}
  493. writeln('Starting backtrace');
  494. {$endif DEBUG}
  495. if dosearch(regexpr^.next,pos) then
  496. begin
  497. dosearch:=true;
  498. exit;
  499. end
  500. else if dosearch(regexpr^.elsepath,pos) then
  501. begin
  502. dosearch:=true;
  503. exit;
  504. end
  505. else
  506. exit;
  507. end;
  508. end;
  509. lastpos:=pos;
  510. if regexpr=nil then
  511. begin
  512. dosearch:=true;
  513. exit;
  514. end;
  515. if regexpr^.typ=ret_illegalend then
  516. exit;
  517. if pos^=#0 then
  518. exit;
  519. end;
  520. end;
  521. begin
  522. RegExprPos:=false;
  523. index:=0;
  524. len:=0;
  525. if regexprengine.Data=nil then
  526. exit;
  527. while p^<>#0 do
  528. begin
  529. if dosearch(regexprengine.Data,p) then
  530. begin
  531. len:=lastpos-p;
  532. RegExprPos:=true;
  533. exit;
  534. end
  535. else
  536. begin
  537. inc(p);
  538. inc(index);
  539. end;
  540. end;
  541. index:=-1;
  542. end;
  543. begin
  544. cs_nonwordchars:=cs_allchars-cs_wordchars;
  545. cs_nondigits:=cs_allchars-cs_digits;
  546. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  547. end.
  548. {
  549. $Log$
  550. Revision 1.1 2002-01-29 17:54:56 peter
  551. * splitted to base and extra
  552. Revision 1.4 2002/01/22 13:13:14 pierre
  553. + add mode objfpc to avoid problems if compiling from IDE dir
  554. Revision 1.3 2000/07/30 14:58:04 sg
  555. * Added modifications by Markus Kaemmerer:
  556. - Unit now compiles with Delphi
  557. - Removed debug output when not compiled with -dDEBUG
  558. Revision 1.2 2000/07/13 11:33:31 michael
  559. + removed logs
  560. }