regexpr.pp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
  1. {
  2. This unit implements basic regular expression support
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000-2005 by Florian Klaempfl
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. { $define DEBUG}
  12. (*
  13. TODO:
  14. - correct backtracking, for example in (...)*
  15. - | support
  16. - getting substrings and using substrings with \1 etc.
  17. - test ^ and $
  18. - newline handling in DOS?
  19. - locals dependend upper/lowercase routines
  20. - extend the interface
  21. - support for number of matches:
  22. {n} Match exactly n times
  23. {n,} Match at least n times
  24. {n,m} Match at least n but not more than m times
  25. *)
  26. {$mode objfpc}
  27. unit regexpr;
  28. interface
  29. { the following declarions are only in the interface because }
  30. { some procedures return pregexprentry but programs which }
  31. { use this unit shouldn't access this data structures }
  32. type
  33. tcharset = set of char;
  34. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  35. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  36. ret_endline);
  37. pregexprentry = ^tregexprentry;
  38. tregexprentry = record
  39. next,nextdestroy : pregexprentry;
  40. case typ : tregexprentrytype of
  41. ret_charset : (chars : tcharset;
  42. elsepath : pregexprentry);
  43. ret_or : (alternative : pregexprentry);
  44. end;
  45. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  46. tregexprflags = set of tregexprflag;
  47. TRegExprEngine = record
  48. Data : pregexprentry;
  49. DestroyList : pregexprentry;
  50. Flags : TRegExprFlags;
  51. end;
  52. const
  53. cs_allchars : tcharset = [#0..#255];
  54. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  55. cs_newline : tcharset = [#10];
  56. cs_digits : tcharset = ['0'..'9'];
  57. cs_whitespace : tcharset = [' ',#9];
  58. var
  59. { these are initilized in the init section of the unit }
  60. cs_nonwordchars : tcharset;
  61. cs_nondigits : tcharset;
  62. cs_nonwhitespace : tcharset;
  63. { the following procedures can be used by units basing }
  64. { on the regexpr unit }
  65. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  66. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  67. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  68. { This function Escape known regex chars and place the result on Return. If something went wrong the function will return false. }
  69. function RegExprEscapeStr (const S : AnsiString) : AnsiString;
  70. implementation
  71. {$ifdef DEBUG}
  72. procedure writecharset(c : tcharset);
  73. var
  74. b : byte;
  75. begin
  76. for b:=0 to 255 do
  77. if chr(b) in c then
  78. write(chr(b));
  79. writeln;
  80. end;
  81. {$endif DEBUG}
  82. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  83. var
  84. first : pregexprentry;
  85. procedure doregister(p : pregexprentry);
  86. begin
  87. p^.nextdestroy:=first;
  88. first:=p;
  89. end;
  90. var
  91. currentpos : pchar;
  92. error : boolean;
  93. function readchars : tcharset;
  94. var
  95. c1 : char;
  96. begin
  97. readchars:=[];
  98. case currentpos^ of
  99. #0:
  100. exit;
  101. '.':
  102. begin
  103. inc(currentpos);
  104. readchars:=cs_allchars-cs_newline;
  105. end;
  106. '\':
  107. begin
  108. inc(currentpos);
  109. case currentpos^ of
  110. #0:
  111. begin
  112. error:=true;
  113. exit;
  114. end;
  115. 't':
  116. begin
  117. inc(currentpos);
  118. readchars:=[#9];
  119. end;
  120. 'n':
  121. begin
  122. inc(currentpos);
  123. readchars:=[#10];
  124. end;
  125. 'r':
  126. begin
  127. inc(currentpos);
  128. readchars:=[#13];
  129. end;
  130. 'd':
  131. begin
  132. inc(currentpos);
  133. readchars:=cs_digits;
  134. end;
  135. 'D':
  136. begin
  137. inc(currentpos);
  138. readchars:=cs_nondigits;
  139. end;
  140. 's':
  141. begin
  142. inc(currentpos);
  143. readchars:=cs_whitespace;
  144. end;
  145. 'S':
  146. begin
  147. inc(currentpos);
  148. readchars:=cs_nonwhitespace;
  149. end;
  150. 'w':
  151. begin
  152. inc(currentpos);
  153. readchars:=cs_wordchars;
  154. end;
  155. 'W':
  156. begin
  157. inc(currentpos);
  158. readchars:=cs_nonwordchars;
  159. end;
  160. 'f' :
  161. begin
  162. inc(currentpos);
  163. readchars:= [#12];
  164. end;
  165. 'a' :
  166. begin
  167. inc(currentpos);
  168. readchars:= [#7];
  169. end;
  170. else
  171. begin //Some basic escaping...
  172. readchars := [currentpos^];
  173. inc (currentpos);
  174. {error:=true;
  175. exit;}
  176. end;
  177. end;
  178. end;
  179. else
  180. begin
  181. if ref_caseinsensitive in flags then
  182. c1:=upcase(currentpos^)
  183. else
  184. c1:=currentpos^;
  185. inc(currentpos);
  186. if currentpos^='-' then
  187. begin
  188. inc(currentpos);
  189. if currentpos^=#0 then
  190. begin
  191. error:=true;
  192. exit;
  193. end;
  194. if ref_caseinsensitive in flags then
  195. readchars:=[c1..upcase(currentpos^)]
  196. else
  197. readchars:=[c1..currentpos^];
  198. inc(currentpos);
  199. end
  200. else
  201. readchars:=[c1];
  202. end;
  203. end;
  204. end;
  205. function readcharset : tcharset;
  206. begin
  207. readcharset:=[];
  208. case currentpos^ of
  209. #0:
  210. exit;
  211. '[':
  212. begin
  213. inc(currentpos);
  214. while currentpos^<>']' do
  215. begin
  216. if currentpos^='^' then
  217. begin
  218. inc(currentpos);
  219. readcharset:=readcharset+(cs_allchars-readchars);
  220. end
  221. else
  222. readcharset:=readcharset+readchars;
  223. if error or (currentpos^=#0) then
  224. begin
  225. error:=true;
  226. exit;
  227. end;
  228. end;
  229. inc(currentpos);
  230. end;
  231. '^':
  232. begin
  233. inc(currentpos);
  234. readcharset:=cs_allchars-readchars;
  235. end;
  236. else
  237. readcharset:=readchars;
  238. end;
  239. end;
  240. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  241. var
  242. hp,hp2,ep : pregexprentry;
  243. cs : tcharset;
  244. chaining : ^pregexprentry;
  245. begin
  246. chaining:=nil;
  247. parseregexpr:=nil;
  248. if error then
  249. exit;
  250. { this dummy allows us to redirect the elsepath later }
  251. new(ep);
  252. doregister(ep);
  253. ep^.typ:=ret_charset;
  254. ep^.chars:=[];
  255. ep^.elsepath:=elsepath;
  256. elsepath:=ep;
  257. while true do
  258. begin
  259. if error then
  260. exit;
  261. case currentpos^ of
  262. '(':
  263. begin
  264. inc(currentpos);
  265. new(hp2);
  266. doregister(hp2);
  267. hp2^.typ:=ret_charset;
  268. hp2^.chars:=[];
  269. hp2^.elsepath:=next;
  270. hp:=parseregexpr(hp2,ep);
  271. if assigned(chaining) then
  272. chaining^:=hp
  273. else
  274. parseregexpr:=hp;
  275. chaining:=@hp2^.elsepath;
  276. if currentpos^<>')' then
  277. begin
  278. error:=true;
  279. exit;
  280. end;
  281. inc(currentpos);
  282. end;
  283. (* '|':
  284. begin
  285. {$ifdef DEBUG}
  286. writeln('Creating backtrace entry');
  287. {$endif DEBUG}
  288. if (not assigned (hp2)) then
  289. new (hp2);
  290. while currentpos^='|' do
  291. begin
  292. inc(currentpos);
  293. if currentpos^=#0 then
  294. begin
  295. error:=true;
  296. exit;
  297. end;
  298. doregister(hp2);
  299. hp2^.typ:=ret_charset;
  300. hp2^.chars:=[];
  301. hp2^.elsepath:=next;
  302. new(hp);
  303. doregister(hp);
  304. hp^.typ:=ret_backtrace;
  305. hp^.elsepath:= parseregexpr (next, elsepath);
  306. hp^.next:=next;
  307. if assigned(chaining) then
  308. chaining^:=hp
  309. else
  310. parseregexpr:=hp;
  311. chaining:=@hp^.elsepath;
  312. end;
  313. exit;
  314. end;
  315. *)
  316. ')':
  317. exit;
  318. '^':
  319. begin
  320. inc(currentpos);
  321. new(hp);
  322. doregister(hp);
  323. hp^.typ:=ret_startline;
  324. hp^.elsepath:=ep;
  325. // hp^.next:=parseregexpr(ep);
  326. end;
  327. '$':
  328. begin
  329. inc(currentpos);
  330. new(hp);
  331. doregister(hp);
  332. hp^.typ:=ret_endline;
  333. hp^.elsepath:=ep;
  334. // hp^.next:=parseregexpr(ep);
  335. end;
  336. #0:
  337. exit;
  338. else
  339. begin
  340. cs:=readcharset;
  341. if error then
  342. exit;
  343. case currentpos^ of
  344. '*':
  345. begin
  346. inc(currentpos);
  347. new(hp);
  348. doregister(hp);
  349. hp^.typ:=ret_charset;
  350. hp^.chars:=cs;
  351. hp^.elsepath:=next;
  352. hp^.next:=hp;
  353. if assigned(chaining) then
  354. chaining^:=hp
  355. else
  356. parseregexpr:=hp;
  357. chaining:=@hp^.elsepath;
  358. end;
  359. '+':
  360. begin
  361. inc(currentpos);
  362. new(hp);
  363. new(hp2);
  364. doregister(hp);
  365. doregister(hp2);
  366. hp^.typ:=ret_charset;
  367. hp2^.typ:=ret_charset;
  368. hp^.chars:=cs;
  369. hp2^.chars:=cs;
  370. hp^.elsepath:=elsepath;
  371. hp^.next:=hp2;
  372. hp2^.elsepath:=next;
  373. hp2^.next:=hp2;
  374. if assigned(chaining) then
  375. chaining^:=hp
  376. else
  377. parseregexpr:=hp;
  378. chaining:=@hp2^.elsepath;
  379. end;
  380. '?':
  381. begin
  382. inc(currentpos);
  383. new(hp);
  384. { this is a dummy }
  385. new(hp2);
  386. doregister(hp);
  387. doregister(hp2);
  388. hp^.typ:=ret_charset;
  389. hp^.chars:=cs;
  390. hp^.next:=hp2;
  391. hp^.elsepath:=hp2;
  392. hp2^.typ:=ret_charset;
  393. hp2^.chars:=[];
  394. hp2^.elsepath:=next;
  395. if assigned(chaining) then
  396. chaining^:=hp
  397. else
  398. parseregexpr:=hp;
  399. chaining:=@hp2^.elsepath;
  400. end;
  401. else
  402. begin
  403. new(hp);
  404. doregister(hp);
  405. hp^.typ:=ret_charset;
  406. hp^.chars:=cs;
  407. hp^.elsepath:=elsepath;
  408. hp^.next:=next;
  409. if assigned(chaining) then
  410. chaining^:=hp
  411. else
  412. parseregexpr:=hp;
  413. chaining:=@hp^.next;
  414. end;
  415. end;
  416. end;
  417. end;
  418. end;
  419. end;
  420. var
  421. endp : pregexprentry;
  422. begin
  423. GenerateRegExprEngine.Data:=nil;
  424. GenerateRegExprEngine.DestroyList:=nil;
  425. if regexpr=nil then
  426. exit;
  427. first:=nil;
  428. if (ref_singleline in flags) and (ref_multiline in flags) then
  429. exit;
  430. currentpos:=regexpr;
  431. error:=false;
  432. new(endp);
  433. doregister(endp);
  434. endp^.typ:=ret_illegalend;
  435. GenerateRegExprEngine.flags:=flags;
  436. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  437. GenerateRegExprEngine.DestroyList:=first;
  438. if error or (currentpos^<>#0) then
  439. DestroyRegExprEngine(Result);
  440. end;
  441. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  442. var
  443. hp : pregexprentry;
  444. begin
  445. hp:=regexpr.DestroyList;
  446. while assigned(hp) do
  447. begin
  448. regexpr.DestroyList:=hp^.nextdestroy;
  449. dispose(hp);
  450. hp:=regexpr.DestroyList;
  451. end;
  452. regexpr.Data:=nil;
  453. regexpr.DestroyList:=nil;
  454. end;
  455. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  456. var
  457. lastpos : pchar;
  458. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  459. begin
  460. dosearch:=false;
  461. while true do
  462. begin
  463. {$IFDEF Debug}
  464. writeln(byte(regexpr^.typ));
  465. {$ENDIF Debug}
  466. case regexpr^.typ of
  467. ret_endline:
  468. begin
  469. if ref_multiline in regexprengine.flags then
  470. begin
  471. if ((pos+1)^ in [#10,#0]) then
  472. regexpr:=regexpr^.next
  473. else
  474. regexpr:=regexpr^.elsepath;
  475. end
  476. else
  477. begin
  478. if (pos+1)^=#0 then
  479. regexpr:=regexpr^.next
  480. else
  481. regexpr:=regexpr^.elsepath;
  482. end;
  483. end;
  484. ret_startline:
  485. begin
  486. if ref_multiline in regexprengine.flags then
  487. begin
  488. if (pos=p) or ((pos-1)^=#10) then
  489. regexpr:=regexpr^.next
  490. else
  491. regexpr:=regexpr^.elsepath;
  492. end
  493. else
  494. begin
  495. if pos=p then
  496. regexpr:=regexpr^.next
  497. else
  498. regexpr:=regexpr^.elsepath;
  499. end;
  500. end;
  501. ret_charset:
  502. begin
  503. if (pos^ in regexpr^.chars) or
  504. ((ref_caseinsensitive in regexprengine.flags) and
  505. (upcase(pos^) in regexpr^.chars)) then
  506. begin
  507. {$ifdef DEBUG}
  508. writeln('Found matching: ',pos^);
  509. {$endif DEBUG}
  510. regexpr:=regexpr^.next;
  511. inc(pos);
  512. end
  513. else
  514. begin
  515. {$ifdef DEBUG}
  516. writeln('Found unmatching: ',pos^);
  517. {$endif DEBUG}
  518. regexpr:=regexpr^.elsepath;
  519. end;
  520. end;
  521. ret_backtrace:
  522. begin
  523. {$ifdef DEBUG}
  524. writeln('Starting backtrace');
  525. {$endif DEBUG}
  526. if dosearch(regexpr^.next,pos) then
  527. begin
  528. dosearch:=true;
  529. exit;
  530. end
  531. else if dosearch(regexpr^.elsepath,pos) then
  532. begin
  533. dosearch:=true;
  534. exit;
  535. end
  536. else
  537. exit;
  538. end;
  539. end;
  540. lastpos:=pos;
  541. if regexpr=nil then
  542. begin
  543. dosearch:=true;
  544. exit;
  545. end;
  546. if regexpr^.typ=ret_illegalend then
  547. exit;
  548. if pos^=#0 then
  549. exit;
  550. end;
  551. end;
  552. begin
  553. RegExprPos:=false;
  554. index:=0;
  555. len:=0;
  556. if regexprengine.Data=nil then
  557. exit;
  558. while p^<>#0 do
  559. begin
  560. if dosearch(regexprengine.Data,p) then
  561. begin
  562. len:=lastpos-p;
  563. RegExprPos:=true;
  564. exit;
  565. end
  566. else
  567. begin
  568. inc(p);
  569. inc(index);
  570. end;
  571. end;
  572. index:=-1;
  573. end;
  574. function RegExprEscapeStr (const S : AnsiString) : AnsiString;
  575. var
  576. i, len : SizeUInt;
  577. begin
  578. Result := '';
  579. if (S = '') then
  580. exit;
  581. SetLength(Result,Length(S)*2);
  582. len := Length (S);
  583. for i := 1 to len do
  584. begin
  585. if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
  586. begin
  587. Result := Result + '\';
  588. end;
  589. Result := Result + S[i];
  590. end;
  591. SetLength(Result,Length(Result));
  592. end;
  593. begin
  594. cs_nonwordchars:=cs_allchars-cs_wordchars;
  595. cs_nondigits:=cs_allchars-cs_digits;
  596. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  597. end.