regexpr.pp 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204
  1. {
  2. This unit implements basic regular expression support
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000-2006 by Florian Klaempfland Carl Eric Codere
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {.$ define DEBUG}
  12. (*
  13. - newline handling (uses all known formats of ASCII, #10,#13,#13#10 and #$85
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - full | support (currently requires to put all items with | operator
  17. between parenthesis (in a group) to take care over order priority.
  18. Therefore the following would work: (foo)|(nofoo) but not
  19. foo|nofoo
  20. - getting substrings and using substrings with \1 etc.
  21. - do we treat several multiline characters in a row as a single
  22. newline character for $ and ^?
  23. *)
  24. {$IFDEF FPC}
  25. {$mode objfpc}
  26. {$ENDIF}
  27. {** @abstract(Regular expression unit)
  28. This unit implements a basic regular expression parser that mostly conforms
  29. to the POSIX extended-regular expression syntax. It also supports standard
  30. escape characters for patterns (as defined in PERL).
  31. }
  32. unit regexpr;
  33. interface
  34. { the following declarions are only in the interface because }
  35. { some procedures return pregexprentry but programs which }
  36. { use this unit shouldn't access this data structures }
  37. type
  38. tcharset = set of char;
  39. tregexprentrytype = (ret_charset,ret_or,
  40. ret_illegalend,ret_backtrace,ret_startline,
  41. ret_endline,ret_pattern);
  42. pregexprentry = ^tregexprentry;
  43. tregexprentry = record
  44. next,nextdestroy : pregexprentry;
  45. case typ : tregexprentrytype of
  46. ret_charset : (chars : tcharset; elsepath : pregexprentry);
  47. {** This is a complete pattern path ()+ , ()* or ()?, n,m }
  48. ret_pattern: (pattern: pregexprentry; minoccurs: integer; maxoccurs: integer; alternative : pregexprentry);
  49. end;
  50. tregexprflag = (
  51. ref_singleline,
  52. {** This indicates that a start of line is either the
  53. start of the pattern or a linebreak. }
  54. ref_multiline,
  55. {** The match will be done in a case-insensitive way
  56. according to US-ASCII character set. }
  57. ref_caseinsensitive);
  58. tregexprflags = set of tregexprflag;
  59. TRegExprEngine = record
  60. Data : pregexprentry;
  61. DestroyList : pregexprentry;
  62. Flags : TRegExprFlags;
  63. end;
  64. const
  65. cs_allchars : tcharset = [#0..#255];
  66. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  67. cs_newline : tcharset = [#10];
  68. cs_digits : tcharset = ['0'..'9'];
  69. cs_whitespace : tcharset = [' ',#9];
  70. var
  71. { these are initilized in the init section of the unit }
  72. cs_nonwordchars : tcharset;
  73. cs_nondigits : tcharset;
  74. cs_nonwhitespace : tcharset;
  75. { the following procedures can be used by units basing }
  76. { on the regexpr unit }
  77. {** From a regular expression, compile an encoded version of the regular expression.
  78. @param(regexpr Regular expression to compile)
  79. @param(flags Flags relating to the type of parsing that will occur)
  80. @param(RegExprEngine The actual encoded version of the regular expression)
  81. @returns(true if success, otherwise syntax error in compiling regular expression)
  82. }
  83. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags;var RegExprEngine: TRegExprEngine): boolean;
  84. {$IFDEF FPC}
  85. {** Backward compatibility routine }
  86. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  87. {$ENDIF}
  88. {** Frees all up resources used for the encoded regular expression }
  89. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  90. {** @abstract(Matches a regular expression)
  91. @param(RegExprEngine The actual compiled regular expression to match against)
  92. @param(p The text to search for for a match)
  93. @param(index zero-based index to the start of the match -1 if no match in p)
  94. @param(len length of the match)
  95. @returns(true if there was a match, otherwise false)
  96. }
  97. function RegExprPos(RegExprEngine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  98. function RegExprReplaceAll(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  99. { This function Escape known regex chars and place the result on Return. If something went wrong the
  100. function will return false. }
  101. function RegExprEscapeStr (const S : string) : string;
  102. implementation
  103. {$ifdef DEBUG}
  104. procedure writecharset(c : tcharset);
  105. var
  106. b : byte;
  107. begin
  108. for b:=20 to 255 do
  109. if chr(b) in c then
  110. write(chr(b));
  111. writeln;
  112. end;
  113. const
  114. typ2str : array[tregexprentrytype] of string =
  115. (
  116. 'ret_charset',
  117. 'ret_or',
  118. 'ret_illegalend',
  119. 'ret_backtrace',
  120. 'ret_startline',
  121. 'ret_endline',
  122. 'ret_pattern'
  123. );
  124. { Dumps all the next elements of a tree }
  125. procedure dumptree(space: string; regentry: pregexprentry);
  126. begin
  127. while assigned(regentry) do
  128. begin
  129. WriteLn(space+'------- Node Type ',typ2str[regentry^.typ]);
  130. if (regentry^.typ = ret_charset) then
  131. WriteCharSet(regentry^.chars);
  132. { dump embedded pattern information }
  133. if regentry^.typ = ret_pattern then
  134. begin
  135. dumptree(space+#9,regentry^.pattern);
  136. WriteLn(space+#9,' --- Alternative nodes ');
  137. if assigned(regentry^.alternative) then
  138. dumptree(space+#9#9,regentry^.alternative);
  139. end;
  140. if regentry^.typ = ret_startline then
  141. dumptree(space+#9,regentry^.pattern);
  142. regentry:=regentry^.next;
  143. end;
  144. end;
  145. {$endif DEBUG}
  146. {** Determines the length of a pattern, including sub-patterns.
  147. It goes through the nodes and returns the pattern length
  148. between the two, using minOccurs as required.
  149. Called recursively.
  150. }
  151. function patlength(hp1: pregexprentry): integer;
  152. var
  153. count: integer;
  154. hp: pregexprentry;
  155. begin
  156. count:=0;
  157. if hp1^.typ=ret_pattern then
  158. hp:=hp1^.pattern
  159. else
  160. hp:=hp1;
  161. { now go through all chars and get the length
  162. does not currently take care of embedded patterns
  163. }
  164. while assigned(hp) do
  165. begin
  166. if hp^.typ = ret_pattern then
  167. begin
  168. inc(count,patlength(hp));
  169. end
  170. else
  171. if hp^.typ = ret_charset then
  172. inc(count);
  173. hp:=hp^.next;
  174. end;
  175. if hp1^.typ=ret_pattern then
  176. begin
  177. count:=hp1^.minOccurs*count;
  178. end;
  179. patlength:=count;
  180. end;
  181. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags; var RegExprEngine:TRegExprEngine) : boolean;
  182. var
  183. first : pregexprentry;
  184. procedure doregister(p : pregexprentry);
  185. begin
  186. p^.nextdestroy:=first;
  187. first:=p;
  188. end;
  189. var
  190. currentpos : pchar;
  191. error : boolean;
  192. procedure readchars(var chars: tcharset);
  193. var
  194. c1 : char;
  195. begin
  196. chars:=[];
  197. case currentpos^ of
  198. #0:
  199. exit;
  200. '.':
  201. begin
  202. inc(currentpos);
  203. chars:=cs_allchars-cs_newline;
  204. end;
  205. '\':
  206. begin
  207. inc(currentpos);
  208. case currentpos^ of
  209. #0:
  210. begin
  211. error:=true;
  212. exit;
  213. end;
  214. 't':
  215. begin
  216. inc(currentpos);
  217. chars:=[#9];
  218. end;
  219. 'n':
  220. begin
  221. inc(currentpos);
  222. chars:=[#10];
  223. end;
  224. 'r':
  225. begin
  226. inc(currentpos);
  227. chars:=[#13];
  228. end;
  229. 'd':
  230. begin
  231. inc(currentpos);
  232. chars:=cs_digits;
  233. end;
  234. 'D':
  235. begin
  236. inc(currentpos);
  237. chars:=cs_nondigits;
  238. end;
  239. 's':
  240. begin
  241. inc(currentpos);
  242. chars:=cs_whitespace;
  243. end;
  244. 'S':
  245. begin
  246. inc(currentpos);
  247. chars:=cs_nonwhitespace;
  248. end;
  249. 'w':
  250. begin
  251. inc(currentpos);
  252. chars:=cs_wordchars;
  253. end;
  254. 'W':
  255. begin
  256. inc(currentpos);
  257. chars:=cs_nonwordchars;
  258. end;
  259. 'f' :
  260. begin
  261. inc(currentpos);
  262. chars:= [#12];
  263. end;
  264. 'a' :
  265. begin
  266. inc(currentpos);
  267. chars:= [#7];
  268. end;
  269. else
  270. begin { Some basic escaping...}
  271. chars := [currentpos^];
  272. inc (currentpos);
  273. {error:=true;
  274. exit;}
  275. end;
  276. end;
  277. end;
  278. else
  279. begin
  280. if ref_caseinsensitive in flags then
  281. c1:=upcase(currentpos^)
  282. else
  283. c1:=currentpos^;
  284. inc(currentpos);
  285. if currentpos^='-' then
  286. begin
  287. inc(currentpos);
  288. if currentpos^=#0 then
  289. begin
  290. error:=true;
  291. exit;
  292. end;
  293. if ref_caseinsensitive in flags then
  294. chars:=[c1..upcase(currentpos^)]
  295. else
  296. chars:=[c1..currentpos^];
  297. inc(currentpos);
  298. end
  299. else
  300. chars:=[c1];
  301. end;
  302. end;
  303. end;
  304. procedure readcharset(var charset: tcharset);
  305. var
  306. chars: tcharset;
  307. begin
  308. charset:=[];
  309. case currentpos^ of
  310. #0:
  311. exit;
  312. '[':
  313. begin
  314. inc(currentpos);
  315. while currentpos^<>']' do
  316. begin
  317. if currentpos^='^' then
  318. begin
  319. inc(currentpos);
  320. readchars(chars);
  321. charset:=charset+(cs_allchars-chars);
  322. end
  323. else
  324. begin
  325. readchars(chars);
  326. charset:=charset+chars;
  327. end;
  328. if error or (currentpos^=#0) then
  329. begin
  330. error:=true;
  331. exit;
  332. end;
  333. end;
  334. inc(currentpos);
  335. end;
  336. '^':
  337. begin
  338. inc(currentpos);
  339. readchars(chars);
  340. charset:=cs_allchars-chars;
  341. end;
  342. else
  343. begin
  344. readchars(chars);
  345. charset:=chars;
  346. end;
  347. end;
  348. end;
  349. (* takes care of parsing the {n}, {n,} and {n,m} regular expression
  350. elements. In case of error, sets error to true and returns false,
  351. otherwise returns true and set minoccurs and maxoccurs accordingly
  352. (-1 if not present). *)
  353. function parseoccurences(var currentpos: pchar; var minoccurs,maxoccurs: integer): boolean;
  354. var
  355. minOccursString: string;
  356. maxOccursString: string;
  357. begin
  358. parseoccurences:=false;
  359. minOccurs:=-1;
  360. maxOccurs:=-1;
  361. inc(currentpos);
  362. minOccursString:='';
  363. if currentPos^ = #0 then
  364. begin
  365. error:=true;
  366. exit;
  367. end;
  368. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  369. begin
  370. minOccursString:=minOccursString+currentPos^;
  371. inc(currentpos);
  372. end;
  373. if length(minOccursString) = 0 then
  374. begin
  375. error:=true;
  376. exit;
  377. end;
  378. Val(minOccursString,minOccurs);
  379. { possible cases here: commad or end bracket }
  380. if currentpos^= '}' then
  381. begin
  382. inc(currentpos);
  383. maxOccurs:=minOccurs;
  384. parseoccurences:=true;
  385. exit;
  386. end;
  387. if currentpos^= ',' then
  388. begin
  389. maxOccursString:='';
  390. inc(currentpos);
  391. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  392. begin
  393. maxOccursString:=maxOccursString+currentPos^;
  394. inc(currentpos);
  395. end;
  396. if currentpos^= '}' then
  397. begin
  398. { If the length of the string is zero, then there is
  399. no upper bound. }
  400. if length(maxOccursString) > 0 then
  401. Val(maxOccursString,maxOccurs)
  402. else
  403. maxOccurs:=high(integer);
  404. inc(currentpos);
  405. parseoccurences:=true;
  406. exit;
  407. end;
  408. end;
  409. error:=true;
  410. end;
  411. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  412. var
  413. hp : pregexprentry;
  414. minOccurs,maxOccurs: integer;
  415. hp3: pregexprentry;
  416. cs : tcharset;
  417. chaining : ^pregexprentry;
  418. begin
  419. chaining:=nil;
  420. parseregexpr:=nil;
  421. elsepath:=nil;
  422. if error then
  423. exit;
  424. { this dummy allows us to redirect the elsepath later }
  425. { new(ep);
  426. doregister(ep);
  427. ep^.typ:=ret_charset;
  428. ep^.chars:=[];
  429. ep^.elsepath:=elsepath;
  430. elsepath:=ep;}
  431. while true do
  432. begin
  433. if error then
  434. exit;
  435. case currentpos^ of
  436. '(':
  437. begin
  438. inc(currentpos);
  439. hp:=parseregexpr(nil,nil);
  440. { Special characters after the bracket }
  441. if error then
  442. exit;
  443. if currentpos^<>')' then
  444. begin
  445. error:=true;
  446. exit;
  447. end;
  448. inc(currentpos);
  449. case currentpos^ of
  450. '*':
  451. begin
  452. inc(currentpos);
  453. new(hp3);
  454. doregister(hp3);
  455. hp3^.typ:=ret_pattern;
  456. hp3^.alternative:=nil;
  457. hp3^.pattern:=hp;
  458. hp3^.elsepath:=elsepath;
  459. hp3^.minoccurs:=0;
  460. hp3^.maxoccurs:=high(integer);
  461. hp3^.next:=nil;
  462. if assigned(chaining) then
  463. chaining^:=hp3
  464. else
  465. parseregexpr:=hp3;
  466. chaining:=@hp3^.next;
  467. end;
  468. '+':
  469. begin
  470. inc(currentpos);
  471. new(hp3);
  472. doregister(hp3);
  473. hp3^.typ:=ret_pattern;
  474. hp3^.alternative:=nil;
  475. hp3^.pattern:=hp;
  476. hp3^.elsepath:=elsepath;
  477. hp3^.minoccurs:=1;
  478. hp3^.maxoccurs:=high(integer);
  479. hp3^.next:=nil;
  480. if assigned(chaining) then
  481. chaining^:=hp3
  482. else
  483. parseregexpr:=hp3;
  484. chaining:=@hp3^.next;
  485. end;
  486. '?':
  487. begin
  488. inc(currentpos);
  489. new(hp3);
  490. doregister(hp3);
  491. hp3^.typ:=ret_pattern;
  492. hp3^.alternative:=nil;
  493. hp3^.pattern:=hp;
  494. hp3^.elsepath:=elsepath;
  495. hp3^.minoccurs:=0;
  496. hp3^.maxoccurs:=1;
  497. hp3^.next:=nil;
  498. if assigned(chaining) then
  499. chaining^:=hp3
  500. else
  501. parseregexpr:=hp3;
  502. chaining:=@hp3^.next;
  503. end;
  504. '{':
  505. begin
  506. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  507. exit;
  508. inc(currentpos);
  509. new(hp3);
  510. doregister(hp3);
  511. hp3^.typ:=ret_pattern;
  512. hp3^.alternative:=nil;
  513. hp3^.pattern:=hp;
  514. hp3^.elsepath:=elsepath;
  515. hp3^.minoccurs:=minOccurs;
  516. hp3^.maxoccurs:=maxOccurs;
  517. hp3^.next:=nil;
  518. if assigned(chaining) then
  519. chaining^:=hp3
  520. else
  521. parseregexpr:=hp3;
  522. chaining:=@hp3^.next;
  523. end;
  524. else
  525. begin
  526. { go to end of this list - always the
  527. last next used }
  528. (*
  529. hp3:=hp;
  530. while assigned(hp3^.next) do
  531. begin
  532. hp3:=hp3^.next;
  533. end;
  534. if assigned(chaining) then
  535. chaining^:=hp
  536. else
  537. parseregexpr:=hp;
  538. chaining:=@hp3^.next;*)
  539. new(hp3);
  540. doregister(hp3);
  541. hp3^.typ:=ret_pattern;
  542. hp3^.alternative:=nil;
  543. hp3^.pattern:=hp;
  544. hp3^.elsepath:=elsepath;
  545. hp3^.minoccurs:=1;
  546. hp3^.maxoccurs:=1;
  547. hp3^.next:=nil;
  548. if assigned(chaining) then
  549. chaining^:=hp3
  550. else
  551. parseregexpr:=hp3;
  552. chaining:=@hp3^.next;
  553. end;
  554. end;
  555. end;
  556. { This is only partially implemented currently, as the terms before
  557. the | character must be grouped together with parenthesis, which
  558. is also compatible with other regular expressions.
  559. }
  560. '|':
  561. begin
  562. {$ifdef DEBUG}
  563. writeln('Creating or entry');
  564. {$endif DEBUG}
  565. if (not assigned (hp3)) then
  566. begin
  567. error:=true;
  568. exit;
  569. end;
  570. if (hp3^.typ <> ret_pattern) then
  571. begin
  572. error:=true;
  573. exit;
  574. end;
  575. while currentpos^='|' do
  576. begin
  577. inc(currentpos);
  578. if currentpos^=#0 then
  579. begin
  580. error:=true;
  581. exit;
  582. end;
  583. { always put the longest pattern first, so
  584. swap the trees as necessary.
  585. }
  586. hp := parseregexpr (next, elsepath);
  587. if patlength(hp) > patlength(hp3^.pattern) then
  588. begin
  589. hp3^.alternative:=hp3^.pattern;
  590. hp3^.pattern:=hp;
  591. end
  592. else
  593. hp3^.alternative:=hp;
  594. end;
  595. end;
  596. ')':
  597. exit;
  598. '^':
  599. begin
  600. inc(currentpos);
  601. hp:=parseregexpr(nil,nil);
  602. { Special characters after the bracket }
  603. if error then
  604. exit;
  605. new(hp3);
  606. doregister(hp3);
  607. hp3^.typ:=ret_startline;
  608. hp3^.pattern:=hp;
  609. hp3^.elsepath:=elsepath;
  610. hp3^.next:=nil;
  611. if assigned(chaining) then
  612. chaining^:=hp3
  613. else
  614. parseregexpr:=hp3;
  615. chaining:=@hp3^.next;
  616. end;
  617. '$':
  618. begin
  619. inc(currentpos);
  620. new(hp);
  621. doregister(hp);
  622. hp^.typ:=ret_endline;
  623. hp^.elsepath:=elsepath;
  624. hp^.next:=nil;
  625. if assigned(chaining) then
  626. chaining^:=hp
  627. else
  628. parseregexpr:=hp;
  629. chaining:=@hp^.next;
  630. end;
  631. #0:
  632. exit;
  633. else
  634. begin
  635. readcharset(cs);
  636. if error then
  637. exit;
  638. case currentpos^ of
  639. '*':
  640. begin
  641. inc(currentpos);
  642. new(hp);
  643. doregister(hp);
  644. hp^.typ:=ret_charset;
  645. hp^.chars:=cs;
  646. hp^.elsepath:=nil;
  647. hp^.next:=nil;
  648. new(hp3);
  649. doregister(hp3);
  650. hp3^.typ:=ret_pattern;
  651. hp3^.alternative:=nil;
  652. hp3^.pattern:=hp;
  653. hp3^.elsepath:=elsepath;
  654. hp3^.minoccurs:=0;
  655. hp3^.maxoccurs:=high(integer);
  656. hp3^.next:=nil;
  657. if assigned(chaining) then
  658. chaining^:=hp3
  659. else
  660. parseregexpr:=hp3;
  661. chaining:=@hp3^.next;
  662. end;
  663. '+':
  664. begin
  665. inc(currentpos);
  666. new(hp);
  667. doregister(hp);
  668. hp^.typ:=ret_charset;
  669. hp^.chars:=cs;
  670. hp^.elsepath:=nil;
  671. hp^.next:=nil;
  672. new(hp3);
  673. doregister(hp3);
  674. hp3^.typ:=ret_pattern;
  675. hp3^.alternative:=nil;
  676. hp3^.pattern:=hp;
  677. hp3^.elsepath:=elsepath;
  678. hp3^.minoccurs:=1;
  679. hp3^.maxoccurs:=high(integer);
  680. hp3^.next:=nil;
  681. if assigned(chaining) then
  682. chaining^:=hp3
  683. else
  684. parseregexpr:=hp3;
  685. chaining:=@hp3^.next;
  686. end;
  687. '?':
  688. begin
  689. inc(currentpos);
  690. new(hp);
  691. doregister(hp);
  692. hp^.typ:=ret_charset;
  693. hp^.chars:=cs;
  694. hp^.elsepath:=nil;
  695. hp^.next:=nil;
  696. new(hp3);
  697. doregister(hp3);
  698. hp3^.typ:=ret_pattern;
  699. hp3^.pattern:=hp;
  700. hp3^.alternative:=nil;
  701. hp3^.elsepath:=elsepath;
  702. hp3^.minoccurs:=0;
  703. hp3^.maxoccurs:=1;
  704. hp3^.next:=nil;
  705. if assigned(chaining) then
  706. chaining^:=hp3
  707. else
  708. parseregexpr:=hp3;
  709. chaining:=@hp3^.next;
  710. end;
  711. '{':
  712. begin
  713. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  714. exit;
  715. new(hp);
  716. doregister(hp);
  717. hp^.typ:=ret_charset;
  718. hp^.chars:=cs;
  719. hp^.elsepath:=nil;
  720. hp^.next:=nil;
  721. new(hp3);
  722. doregister(hp3);
  723. hp3^.typ:=ret_pattern;
  724. hp3^.alternative:=nil;
  725. hp3^.pattern:=hp;
  726. hp3^.elsepath:=elsepath;
  727. hp3^.minoccurs:=minOccurs;
  728. hp3^.maxoccurs:=maxOccurs;
  729. hp3^.next:=nil;
  730. if assigned(chaining) then
  731. chaining^:=hp3
  732. else
  733. parseregexpr:=hp3;
  734. chaining:=@hp3^.next;
  735. end;
  736. else
  737. { Normal character }
  738. begin
  739. new(hp);
  740. doregister(hp);
  741. hp^.typ:=ret_charset;
  742. hp^.chars:=cs;
  743. hp^.elsepath:=elsepath;
  744. hp^.next:=next;
  745. if assigned(chaining) then
  746. chaining^:=hp
  747. else
  748. parseregexpr:=hp;
  749. chaining:=@hp^.next;
  750. continue;
  751. end;
  752. { This was a pattern }
  753. end; { END CASE }
  754. end;
  755. end;
  756. end;
  757. end;
  758. var
  759. endp : pregexprentry;
  760. begin
  761. GenerateRegExprEngine:=false;
  762. RegExprEngine.Data:=nil;
  763. RegExprEngine.DestroyList:=nil;
  764. if regexpr=nil then
  765. exit;
  766. first:=nil;
  767. if (ref_singleline in flags) and (ref_multiline in flags) then
  768. exit;
  769. currentpos:=regexpr;
  770. GenerateRegExprEngine:=true;
  771. error:=false;
  772. new(endp);
  773. doregister(endp);
  774. endp^.typ:=ret_illegalend;
  775. RegExprEngine.flags:=flags;
  776. RegExprEngine.Data:=parseregexpr(nil,endp);
  777. {$IFDEF DEBUG}
  778. writeln('========== Generating tree ============');
  779. dumptree('',RegExprEngine.Data);
  780. {$ENDIF}
  781. RegExprEngine.DestroyList:=first;
  782. if error or (currentpos^<>#0) then
  783. begin
  784. GenerateRegExprEngine:=false;
  785. DestroyRegExprEngine(RegExprEngine);
  786. end;
  787. end;
  788. {$IFDEF FPC}
  789. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  790. var
  791. r: TRegExprEngine;
  792. begin
  793. GenerateRegExprEngine(regexpr,flags,r);
  794. GenerateRegExprEngine:=r;
  795. end;
  796. {$ENDIF}
  797. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  798. var
  799. hp : pregexprentry;
  800. begin
  801. hp:=regexpr.DestroyList;
  802. while assigned(hp) do
  803. begin
  804. regexpr.DestroyList:=hp^.nextdestroy;
  805. dispose(hp);
  806. hp:=regexpr.DestroyList;
  807. end;
  808. regexpr.Data:=nil;
  809. regexpr.DestroyList:=nil;
  810. end;
  811. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  812. var
  813. lastpos : pchar;
  814. firstpos: pchar;
  815. { Does the actual search of the data - return true if the term was found }
  816. function dosearch(regexprentry : pregexprentry;pos : pchar) : boolean;
  817. var
  818. found: boolean;
  819. checkvalue: boolean;
  820. savedpos: pchar;
  821. counter: word;
  822. begin
  823. dosearch:=false;
  824. while true do
  825. begin
  826. {$IFDEF Debug}
  827. writeln('Entering ',typ2str[regexprentry^.typ]);
  828. writeln('Pattern length ',patlength(regexprentry));
  829. {$ENDIF Debug}
  830. case regexprentry^.typ of
  831. ret_endline:
  832. begin
  833. { automatically a match! }
  834. if pos^ = #0 then
  835. begin
  836. dosearch:=true;
  837. exit;
  838. end;
  839. if ref_multiline in regexprengine.flags then
  840. begin
  841. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  842. { avoid reading invalid memory also }
  843. if (pos^=#13) and ((pos+1)^=#10) then
  844. begin
  845. regexprentry:=regexprentry^.next;
  846. end
  847. else
  848. if (pos^=#$85) or (pos^=#10) or ((pos^=#13) and ((pos-1) >= firstpos) and ((pos-1)^ <> #10)) then
  849. begin
  850. regexprentry:=regexprentry^.next;
  851. end
  852. else
  853. begin
  854. dosearch:=false;
  855. exit;
  856. end;
  857. end
  858. else
  859. exit;
  860. end;
  861. ret_pattern:
  862. begin
  863. found:=false;
  864. { Take care of occurences here }
  865. savedpos:=pos;
  866. counter:=0;
  867. repeat
  868. found:=dosearch(regexprentry^.pattern,pos);
  869. if not found then
  870. break;
  871. pos:=lastpos;
  872. inc(counter);
  873. until (not found) or (counter >= regexprentry^.maxoccurs) or (pos^= #0);
  874. if counter = 0 then
  875. begin
  876. { If there was no occurence and the minimum occurence is > 0 then
  877. problem.
  878. }
  879. if (regexprentry^.minoccurs > 0) then
  880. begin
  881. dosearch:=false;
  882. { verify alternative path as required }
  883. if assigned(regexprentry^.alternative) then
  884. begin
  885. dosearch:=dosearch(regexprentry^.alternative,savedpos);
  886. exit;
  887. end;
  888. exit;
  889. end;
  890. dosearch:=true;
  891. lastpos:=savedpos;
  892. end
  893. else
  894. { found }
  895. begin
  896. { Possible choices :
  897. - found and (counter >= minOccurences) and (counter =< maxOccurences) = true
  898. - found and (counter < minOccurences) or (counter > maxOccurences) = false
  899. }
  900. if (counter < regexprentry^.minoccurs) or (counter > regexprentry^.maxoccurs) then
  901. begin
  902. dosearch:=false;
  903. exit;
  904. end;
  905. dosearch:=true;
  906. { if all matches were found, and the current position
  907. points to zero (processed all characters) }
  908. if pos^=#0 then
  909. begin
  910. dosearch:=true;
  911. exit;
  912. end;
  913. end;
  914. { If we are that means the matches were valid, go to next element to match
  915. }
  916. regexprentry:=regexprentry^.next;
  917. if (counter = 0) and not assigned(regexprentry) then
  918. exit;
  919. end;
  920. ret_startline:
  921. begin
  922. checkvalue:=pos=firstpos;
  923. if ref_multiline in regexprengine.flags then
  924. begin
  925. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  926. { avoid reading invalid memory also }
  927. if
  928. (
  929. ((pos-1) >= firstpos) and ((pos-1)^=#$85)
  930. )
  931. or
  932. (
  933. ((pos-1) >= firstpos) and ((pos-1)^=#10)
  934. )
  935. or
  936. (
  937. ((pos-1) >= firstpos) and ((pos-1)^=#13) and
  938. ((pos)^ <> #10)
  939. )
  940. then
  941. begin
  942. checkvalue:=true;
  943. end;
  944. end;
  945. if checkvalue then
  946. begin
  947. dosearch:=dosearch(regexprentry^.pattern,pos);
  948. regexprentry:=regexprentry^.next;
  949. if not dosearch then
  950. exit;
  951. pos:=lastpos;
  952. end
  953. else
  954. begin
  955. dosearch:=false;
  956. exit;
  957. end;
  958. end;
  959. ret_charset:
  960. begin
  961. if (pos^ in regexprentry^.chars) or
  962. ((ref_caseinsensitive in regexprengine.flags) and
  963. (upcase(pos^) in regexprentry^.chars)) then
  964. begin
  965. {$ifdef DEBUG}
  966. writeln('Found matching: ',pos^);
  967. {$endif DEBUG}
  968. regexprentry:=regexprentry^.next;
  969. inc(pos);
  970. end
  971. else
  972. begin
  973. {$ifdef DEBUG}
  974. writeln('Found unmatching: ',pos^);
  975. {$endif DEBUG}
  976. exit;
  977. end;
  978. end;
  979. ret_backtrace:
  980. begin
  981. {$ifdef DEBUG}
  982. writeln('Starting backtrace');
  983. {$endif DEBUG}
  984. if dosearch(regexprentry^.next,pos) then
  985. begin
  986. dosearch:=true;
  987. exit;
  988. end
  989. else if dosearch(regexprentry^.elsepath,pos) then
  990. begin
  991. dosearch:=true;
  992. exit;
  993. end
  994. else
  995. exit;
  996. end;
  997. end;
  998. lastpos:=pos;
  999. if regexprentry=nil then
  1000. begin
  1001. dosearch:=true;
  1002. exit;
  1003. end;
  1004. if regexprentry^.typ=ret_illegalend then
  1005. exit;
  1006. { end of string, and we were expecting an end of string }
  1007. if (pos^=#0) and (regexprentry^.typ = ret_endline) and
  1008. (not assigned(regexprentry^.next)) then
  1009. begin
  1010. dosearch:=true;
  1011. exit;
  1012. end;
  1013. if pos^=#0 then
  1014. exit;
  1015. end;
  1016. end;
  1017. begin
  1018. RegExprPos:=false;
  1019. index:=0;
  1020. len:=0;
  1021. firstpos:=p;
  1022. if regexprengine.Data=nil then
  1023. exit;
  1024. while p^<>#0 do
  1025. begin
  1026. if dosearch(regexprengine.Data,p) then
  1027. begin
  1028. len:=lastpos-p;
  1029. RegExprPos:=true;
  1030. exit;
  1031. end
  1032. else
  1033. begin
  1034. inc(p);
  1035. inc(index);
  1036. end;
  1037. end;
  1038. index:=-1;
  1039. end;
  1040. function RegExprReplaceAll(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  1041. var
  1042. index,len : longint;
  1043. pos,lastpos : pchar;
  1044. first : boolean;
  1045. oldlength : PtrInt;
  1046. begin
  1047. pos:=pchar(src);
  1048. lastpos:=pos;
  1049. first:=true;
  1050. Result:=0;
  1051. { estimate some length }
  1052. SetLength(dest,length(src)+((length(src) div 10)*length(newstr)));
  1053. while RegExprPos(RegExprEngine,pos,index,len) do
  1054. begin
  1055. inc(pos,index);
  1056. if pos>lastpos then
  1057. begin
  1058. { copy skipped part }
  1059. { because we cheat with SetLength a SetLength(...,0) isn't what we want
  1060. so we've to trick at the first SetLength call
  1061. }
  1062. if first then
  1063. begin
  1064. SetLength(dest,(pos-lastpos));
  1065. { cast dest here because it is already unified }
  1066. move(lastpos^,char(dest[1]),pos-lastpos);
  1067. end
  1068. else
  1069. begin
  1070. oldlength:=Length(dest);
  1071. SetLength(dest,oldlength+(pos-lastpos));
  1072. move(lastpos^,char(dest[oldlength+1]),pos-lastpos);
  1073. end;
  1074. first:=false;
  1075. end;
  1076. { found }
  1077. inc(Result);
  1078. dest:=dest+newstr;
  1079. inc(pos,len);
  1080. lastpos:=pos;
  1081. end;
  1082. { copy remainder }
  1083. len:=strlen(pos);
  1084. if first then
  1085. begin
  1086. SetLength(dest,len);
  1087. move(pos^,char(dest[length(dest)+1]),len);
  1088. end
  1089. else
  1090. begin
  1091. oldlength:=Length(dest);
  1092. SetLength(dest,oldlength+len);
  1093. move(pos^,char(dest[oldlength+1]),len);
  1094. end
  1095. end;
  1096. function RegExprEscapeStr (const S : string) : string;
  1097. var
  1098. i, len : integer;
  1099. s1: string;
  1100. begin
  1101. RegExprEscapeStr:= '';
  1102. s1:='';
  1103. if (S = '') then
  1104. exit;
  1105. len := Length (S);
  1106. for i := 1 to len do
  1107. begin
  1108. if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
  1109. begin
  1110. s1 := s1 + '\';
  1111. end;
  1112. s1 := s1 + S[i];
  1113. end;
  1114. RegExprEscapeStr:=s1;
  1115. end;
  1116. begin
  1117. cs_nonwordchars:=cs_allchars-cs_wordchars;
  1118. cs_nondigits:=cs_allchars-cs_digits;
  1119. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  1120. end.