regexpr.pp 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207
  1. { $DEFINE DEBUG}
  2. {
  3. This unit implements basic regular expression support
  4. This file is part of the Free Pascal run time library.
  5. Copyright (c) 2000-2006 by Florian Klaempfland Carl Eric Codere
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {.$ define DEBUG}
  13. (*
  14. - newline handling (uses all known formats of ASCII, #10,#13,#13#10 and #$85
  15. TODO:
  16. - correct backtracking, for example in (...)*
  17. - full | support (currently requires to put all items with | operator
  18. between parenthesis (in a group) to take care over order priority.
  19. Therefore the following would work: (foo)|(nofoo) but not
  20. foo|nofoo
  21. - getting substrings and using substrings with \1 etc.
  22. - do we treat several multiline characters in a row as a single
  23. newline character for $ and ^?
  24. *)
  25. {$IFDEF FPC}
  26. {$mode objfpc}
  27. {$ENDIF}
  28. {** @abstract(Regular expression unit)
  29. This unit implements a basic regular expression parser that mostly conforms
  30. to the POSIX extended-regular expression syntax. It also supports standard
  31. escape characters for patterns (as defined in PERL).
  32. }
  33. unit regexpr;
  34. interface
  35. { the following declarions are only in the interface because }
  36. { some procedures return pregexprentry but programs which }
  37. { use this unit shouldn't access this data structures }
  38. type
  39. tcharset = set of char;
  40. tregexprentrytype = (ret_charset,ret_or,
  41. ret_illegalend,ret_backtrace,ret_startline,
  42. ret_endline,ret_pattern);
  43. pregexprentry = ^tregexprentry;
  44. tregexprentry = record
  45. next,nextdestroy : pregexprentry;
  46. case typ : tregexprentrytype of
  47. ret_charset : (chars : tcharset; elsepath : pregexprentry);
  48. {** This is a complete pattern path ()+ , ()* or ()?, n,m }
  49. ret_pattern: (pattern: pregexprentry; minoccurs: integer; maxoccurs: integer; alternative : pregexprentry);
  50. end;
  51. tregexprflag = (
  52. ref_singleline,
  53. {** This indicates that a start of line is either the
  54. start of the pattern or a linebreak. }
  55. ref_multiline,
  56. {** The match will be done in a case-insensitive way
  57. according to US-ASCII character set. }
  58. ref_caseinsensitive);
  59. tregexprflags = set of tregexprflag;
  60. TRegExprEngine = record
  61. Data : pregexprentry;
  62. DestroyList : pregexprentry;
  63. Flags : TRegExprFlags;
  64. end;
  65. const
  66. cs_allchars : tcharset = [#0..#255];
  67. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  68. cs_newline : tcharset = [#10];
  69. cs_digits : tcharset = ['0'..'9'];
  70. cs_whitespace : tcharset = [' ',#9];
  71. var
  72. { these are initilized in the init section of the unit }
  73. cs_nonwordchars : tcharset;
  74. cs_nondigits : tcharset;
  75. cs_nonwhitespace : tcharset;
  76. { the following procedures can be used by units basing }
  77. { on the regexpr unit }
  78. {** From a regular expression, compile an encoded version of the regular expression.
  79. @param(regexpr Regular expression to compile)
  80. @param(flags Flags relating to the type of parsing that will occur)
  81. @param(RegExprEngine The actual encoded version of the regular expression)
  82. @returns(true if success, otherwise syntax error in compiling regular expression)
  83. }
  84. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags;var RegExprEngine: TRegExprEngine): boolean;
  85. {$IFDEF FPC}
  86. {** Backward compatibility routine }
  87. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  88. {$ENDIF}
  89. {** Frees all up resources used for the encoded regular expression }
  90. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  91. {** @abstract(Matches a regular expression)
  92. @param(RegExprEngine The actual compiled regular expression to match against)
  93. @param(p The text to search for for a match)
  94. @param(index zero-based index to the start of the match -1 if no match in p)
  95. @param(len length of the match)
  96. @returns(true if there was a match, otherwise false)
  97. }
  98. function RegExprPos(RegExprEngine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  99. function RegExprReplaceAll(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  100. { This function Escape known regex chars and place the result on Return. If something went wrong the
  101. function will return false. }
  102. function RegExprEscapeStr (const S : string) : string;
  103. implementation
  104. {$ifdef DEBUG}
  105. procedure writecharset(c : tcharset);
  106. var
  107. b : byte;
  108. begin
  109. for b:=20 to 255 do
  110. if chr(b) in c then
  111. write(chr(b));
  112. writeln;
  113. end;
  114. const
  115. typ2str : array[tregexprentrytype] of string =
  116. (
  117. 'ret_charset',
  118. 'ret_or',
  119. 'ret_illegalend',
  120. 'ret_backtrace',
  121. 'ret_startline',
  122. 'ret_endline',
  123. 'ret_pattern'
  124. );
  125. { Dumps all the next elements of a tree }
  126. procedure dumptree(space: string; regentry: pregexprentry);
  127. begin
  128. while assigned(regentry) do
  129. begin
  130. WriteLn(space+'------- Node Type ',typ2str[regentry^.typ]);
  131. if (regentry^.typ = ret_charset) then
  132. WriteCharSet(regentry^.chars);
  133. { dump embedded pattern information }
  134. if regentry^.typ = ret_pattern then
  135. begin
  136. dumptree(space+#9,regentry^.pattern);
  137. WriteLn(space+#9,' --- Alternative nodes ');
  138. if assigned(regentry^.alternative) then
  139. dumptree(space+#9#9,regentry^.alternative);
  140. end;
  141. if regentry^.typ = ret_startline then
  142. dumptree(space+#9,regentry^.pattern);
  143. regentry:=regentry^.next;
  144. end;
  145. end;
  146. {$endif DEBUG}
  147. {** Determines the length of a pattern, including sub-patterns.
  148. It goes through the nodes and returns the pattern length
  149. between the two, using minOccurs as required.
  150. Called recursively.
  151. }
  152. function patlength(hp1: pregexprentry): integer;
  153. var
  154. count: integer;
  155. hp: pregexprentry;
  156. begin
  157. count:=0;
  158. if hp1^.typ=ret_pattern then
  159. hp:=hp1^.pattern
  160. else
  161. hp:=hp1;
  162. { now go through all chars and get the length
  163. does not currently take care of embedded patterns
  164. }
  165. while assigned(hp) do
  166. begin
  167. if hp^.typ = ret_pattern then
  168. begin
  169. inc(count,patlength(hp));
  170. end
  171. else
  172. if hp^.typ = ret_charset then
  173. inc(count);
  174. hp:=hp^.next;
  175. end;
  176. if hp1^.typ=ret_pattern then
  177. begin
  178. count:=hp1^.minOccurs*count;
  179. end;
  180. patlength:=count;
  181. end;
  182. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags; var RegExprEngine:TRegExprEngine) : boolean;
  183. var
  184. first : pregexprentry;
  185. procedure doregister(p : pregexprentry);
  186. begin
  187. p^.nextdestroy:=first;
  188. first:=p;
  189. end;
  190. var
  191. currentpos : pchar;
  192. error : boolean;
  193. procedure readchars(var chars: tcharset);
  194. var
  195. c1 : char;
  196. begin
  197. chars:=[];
  198. case currentpos^ of
  199. #0:
  200. exit;
  201. '.':
  202. begin
  203. inc(currentpos);
  204. chars:=cs_allchars-cs_newline;
  205. end;
  206. '\':
  207. begin
  208. inc(currentpos);
  209. case currentpos^ of
  210. #0:
  211. begin
  212. error:=true;
  213. exit;
  214. end;
  215. 't':
  216. begin
  217. inc(currentpos);
  218. chars:=[#9];
  219. end;
  220. 'n':
  221. begin
  222. inc(currentpos);
  223. chars:=[#10];
  224. end;
  225. 'r':
  226. begin
  227. inc(currentpos);
  228. chars:=[#13];
  229. end;
  230. 'd':
  231. begin
  232. inc(currentpos);
  233. chars:=cs_digits;
  234. end;
  235. 'D':
  236. begin
  237. inc(currentpos);
  238. chars:=cs_nondigits;
  239. end;
  240. 's':
  241. begin
  242. inc(currentpos);
  243. chars:=cs_whitespace;
  244. end;
  245. 'S':
  246. begin
  247. inc(currentpos);
  248. chars:=cs_nonwhitespace;
  249. end;
  250. 'w':
  251. begin
  252. inc(currentpos);
  253. chars:=cs_wordchars;
  254. end;
  255. 'W':
  256. begin
  257. inc(currentpos);
  258. chars:=cs_nonwordchars;
  259. end;
  260. 'f' :
  261. begin
  262. inc(currentpos);
  263. chars:= [#12];
  264. end;
  265. 'a' :
  266. begin
  267. inc(currentpos);
  268. chars:= [#7];
  269. end;
  270. else
  271. begin { Some basic escaping...}
  272. chars := [currentpos^];
  273. inc (currentpos);
  274. {error:=true;
  275. exit;}
  276. end;
  277. end;
  278. end;
  279. else
  280. begin
  281. if ref_caseinsensitive in flags then
  282. c1:=upcase(currentpos^)
  283. else
  284. c1:=currentpos^;
  285. inc(currentpos);
  286. if currentpos^='-' then
  287. begin
  288. inc(currentpos);
  289. if currentpos^=#0 then
  290. begin
  291. error:=true;
  292. exit;
  293. end;
  294. if ref_caseinsensitive in flags then
  295. chars:=[c1..upcase(currentpos^)]
  296. else
  297. chars:=[c1..currentpos^];
  298. inc(currentpos);
  299. end
  300. else
  301. chars:=[c1];
  302. end;
  303. end;
  304. end;
  305. procedure readcharset(var charset: tcharset);
  306. var
  307. chars: tcharset;
  308. begin
  309. charset:=[];
  310. case currentpos^ of
  311. #0:
  312. exit;
  313. '[':
  314. begin
  315. inc(currentpos);
  316. while currentpos^<>']' do
  317. begin
  318. if currentpos^='^' then
  319. begin
  320. inc(currentpos);
  321. readchars(chars);
  322. charset:=charset+(cs_allchars-chars);
  323. end
  324. else
  325. begin
  326. readchars(chars);
  327. charset:=charset+chars;
  328. end;
  329. if error or (currentpos^=#0) then
  330. begin
  331. error:=true;
  332. exit;
  333. end;
  334. end;
  335. inc(currentpos);
  336. end;
  337. '^':
  338. begin
  339. inc(currentpos);
  340. readchars(chars);
  341. charset:=cs_allchars-chars;
  342. end;
  343. else
  344. begin
  345. readchars(chars);
  346. charset:=chars;
  347. end;
  348. end;
  349. end;
  350. (* takes care of parsing the {n}, {n,} and {n,m} regular expression
  351. elements. In case of error, sets error to true and returns false,
  352. otherwise returns true and set minoccurs and maxoccurs accordingly
  353. (-1 if not present). *)
  354. function parseoccurences(var currentpos: pchar; var minoccurs,maxoccurs: integer): boolean;
  355. var
  356. minOccursString: string;
  357. maxOccursString: string;
  358. begin
  359. parseoccurences:=false;
  360. minOccurs:=-1;
  361. maxOccurs:=-1;
  362. inc(currentpos);
  363. minOccursString:='';
  364. if currentPos^ = #0 then
  365. begin
  366. error:=true;
  367. exit;
  368. end;
  369. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  370. begin
  371. minOccursString:=minOccursString+currentPos^;
  372. inc(currentpos);
  373. end;
  374. if length(minOccursString) = 0 then
  375. begin
  376. error:=true;
  377. exit;
  378. end;
  379. Val(minOccursString,minOccurs);
  380. { possible cases here: commad or end bracket }
  381. if currentpos^= '}' then
  382. begin
  383. inc(currentpos);
  384. maxOccurs:=minOccurs;
  385. parseoccurences:=true;
  386. exit;
  387. end;
  388. if currentpos^= ',' then
  389. begin
  390. maxOccursString:='';
  391. inc(currentpos);
  392. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  393. begin
  394. maxOccursString:=maxOccursString+currentPos^;
  395. inc(currentpos);
  396. end;
  397. if currentpos^= '}' then
  398. begin
  399. { If the length of the string is zero, then there is
  400. no upper bound. }
  401. if length(maxOccursString) > 0 then
  402. Val(maxOccursString,maxOccurs)
  403. else
  404. maxOccurs:=high(integer);
  405. inc(currentpos);
  406. parseoccurences:=true;
  407. exit;
  408. end;
  409. end;
  410. error:=true;
  411. end;
  412. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  413. var
  414. hp : pregexprentry;
  415. minOccurs,maxOccurs: integer;
  416. hp3: pregexprentry;
  417. cs : tcharset;
  418. chaining : ^pregexprentry;
  419. begin
  420. chaining:=nil;
  421. parseregexpr:=nil;
  422. elsepath:=nil;
  423. if error then
  424. exit;
  425. { this dummy allows us to redirect the elsepath later }
  426. { new(ep);
  427. doregister(ep);
  428. ep^.typ:=ret_charset;
  429. ep^.chars:=[];
  430. ep^.elsepath:=elsepath;
  431. elsepath:=ep;}
  432. while true do
  433. begin
  434. if error then
  435. exit;
  436. case currentpos^ of
  437. '(':
  438. begin
  439. inc(currentpos);
  440. hp:=parseregexpr(nil,nil);
  441. { Special characters after the bracket }
  442. if error then
  443. exit;
  444. if currentpos^<>')' then
  445. begin
  446. error:=true;
  447. exit;
  448. end;
  449. inc(currentpos);
  450. case currentpos^ of
  451. '*':
  452. begin
  453. inc(currentpos);
  454. new(hp3);
  455. doregister(hp3);
  456. hp3^.typ:=ret_pattern;
  457. hp3^.alternative:=nil;
  458. hp3^.pattern:=hp;
  459. hp3^.elsepath:=elsepath;
  460. hp3^.minoccurs:=0;
  461. hp3^.maxoccurs:=high(integer);
  462. hp3^.next:=nil;
  463. if assigned(chaining) then
  464. chaining^:=hp3
  465. else
  466. parseregexpr:=hp3;
  467. chaining:=@hp3^.next;
  468. end;
  469. '+':
  470. begin
  471. inc(currentpos);
  472. new(hp3);
  473. doregister(hp3);
  474. hp3^.typ:=ret_pattern;
  475. hp3^.alternative:=nil;
  476. hp3^.pattern:=hp;
  477. hp3^.elsepath:=elsepath;
  478. hp3^.minoccurs:=1;
  479. hp3^.maxoccurs:=high(integer);
  480. hp3^.next:=nil;
  481. if assigned(chaining) then
  482. chaining^:=hp3
  483. else
  484. parseregexpr:=hp3;
  485. chaining:=@hp3^.next;
  486. end;
  487. '?':
  488. begin
  489. inc(currentpos);
  490. new(hp3);
  491. doregister(hp3);
  492. hp3^.typ:=ret_pattern;
  493. hp3^.alternative:=nil;
  494. hp3^.pattern:=hp;
  495. hp3^.elsepath:=elsepath;
  496. hp3^.minoccurs:=0;
  497. hp3^.maxoccurs:=1;
  498. hp3^.next:=nil;
  499. if assigned(chaining) then
  500. chaining^:=hp3
  501. else
  502. parseregexpr:=hp3;
  503. chaining:=@hp3^.next;
  504. end;
  505. '{':
  506. begin
  507. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  508. exit;
  509. inc(currentpos);
  510. new(hp3);
  511. doregister(hp3);
  512. hp3^.typ:=ret_pattern;
  513. hp3^.alternative:=nil;
  514. hp3^.pattern:=hp;
  515. hp3^.elsepath:=elsepath;
  516. hp3^.minoccurs:=minOccurs;
  517. hp3^.maxoccurs:=maxOccurs;
  518. hp3^.next:=nil;
  519. if assigned(chaining) then
  520. chaining^:=hp3
  521. else
  522. parseregexpr:=hp3;
  523. chaining:=@hp3^.next;
  524. end;
  525. else
  526. begin
  527. { go to end of this list - always the
  528. last next used }
  529. (*
  530. hp3:=hp;
  531. while assigned(hp3^.next) do
  532. begin
  533. hp3:=hp3^.next;
  534. end;
  535. if assigned(chaining) then
  536. chaining^:=hp
  537. else
  538. parseregexpr:=hp;
  539. chaining:=@hp3^.next;*)
  540. new(hp3);
  541. doregister(hp3);
  542. hp3^.typ:=ret_pattern;
  543. hp3^.alternative:=nil;
  544. hp3^.pattern:=hp;
  545. hp3^.elsepath:=elsepath;
  546. hp3^.minoccurs:=1;
  547. hp3^.maxoccurs:=1;
  548. hp3^.next:=nil;
  549. if assigned(chaining) then
  550. chaining^:=hp3
  551. else
  552. parseregexpr:=hp3;
  553. chaining:=@hp3^.next;
  554. end;
  555. end;
  556. end;
  557. { This is only partially implemented currently, as the terms before
  558. the | character must be grouped together with parenthesis, which
  559. is also compatible with other regular expressions.
  560. }
  561. '|':
  562. begin
  563. {$ifdef DEBUG}
  564. writeln('Creating or entry');
  565. {$endif DEBUG}
  566. if (not assigned (hp3)) then
  567. begin
  568. error:=true;
  569. exit;
  570. end;
  571. if (hp3^.typ <> ret_pattern) then
  572. begin
  573. error:=true;
  574. exit;
  575. end;
  576. while currentpos^='|' do
  577. begin
  578. inc(currentpos);
  579. if currentpos^=#0 then
  580. begin
  581. error:=true;
  582. exit;
  583. end;
  584. { always put the longest pattern first, so
  585. swap the trees as necessary.
  586. }
  587. hp := parseregexpr (next, elsepath);
  588. if patlength(hp) > patlength(hp3^.pattern) then
  589. begin
  590. hp3^.alternative:=hp3^.pattern;
  591. hp3^.pattern:=hp;
  592. end
  593. else
  594. hp3^.alternative:=hp;
  595. end;
  596. end;
  597. ')':
  598. exit;
  599. '^':
  600. begin
  601. inc(currentpos);
  602. hp:=parseregexpr(nil,nil);
  603. { Special characters after the bracket }
  604. if error then
  605. exit;
  606. new(hp3);
  607. doregister(hp3);
  608. hp3^.typ:=ret_startline;
  609. hp3^.pattern:=hp;
  610. hp3^.elsepath:=elsepath;
  611. hp3^.next:=nil;
  612. if assigned(chaining) then
  613. chaining^:=hp3
  614. else
  615. parseregexpr:=hp3;
  616. chaining:=@hp3^.next;
  617. end;
  618. '$':
  619. begin
  620. inc(currentpos);
  621. new(hp);
  622. doregister(hp);
  623. hp^.typ:=ret_endline;
  624. hp^.elsepath:=elsepath;
  625. hp^.next:=nil;
  626. if assigned(chaining) then
  627. chaining^:=hp
  628. else
  629. parseregexpr:=hp;
  630. chaining:=@hp^.next;
  631. end;
  632. #0:
  633. exit;
  634. else
  635. begin
  636. readcharset(cs);
  637. if error then
  638. exit;
  639. case currentpos^ of
  640. '*':
  641. begin
  642. inc(currentpos);
  643. new(hp);
  644. doregister(hp);
  645. hp^.typ:=ret_charset;
  646. hp^.chars:=cs;
  647. hp^.elsepath:=nil;
  648. hp^.next:=nil;
  649. new(hp3);
  650. doregister(hp3);
  651. hp3^.typ:=ret_pattern;
  652. hp3^.alternative:=nil;
  653. hp3^.pattern:=hp;
  654. hp3^.elsepath:=elsepath;
  655. hp3^.minoccurs:=0;
  656. hp3^.maxoccurs:=high(integer);
  657. hp3^.next:=nil;
  658. if assigned(chaining) then
  659. chaining^:=hp3
  660. else
  661. parseregexpr:=hp3;
  662. chaining:=@hp3^.next;
  663. end;
  664. '+':
  665. begin
  666. inc(currentpos);
  667. new(hp);
  668. doregister(hp);
  669. hp^.typ:=ret_charset;
  670. hp^.chars:=cs;
  671. hp^.elsepath:=nil;
  672. hp^.next:=nil;
  673. new(hp3);
  674. doregister(hp3);
  675. hp3^.typ:=ret_pattern;
  676. hp3^.alternative:=nil;
  677. hp3^.pattern:=hp;
  678. hp3^.elsepath:=elsepath;
  679. hp3^.minoccurs:=1;
  680. hp3^.maxoccurs:=high(integer);
  681. hp3^.next:=nil;
  682. if assigned(chaining) then
  683. chaining^:=hp3
  684. else
  685. parseregexpr:=hp3;
  686. chaining:=@hp3^.next;
  687. end;
  688. '?':
  689. begin
  690. inc(currentpos);
  691. new(hp);
  692. doregister(hp);
  693. hp^.typ:=ret_charset;
  694. hp^.chars:=cs;
  695. hp^.elsepath:=nil;
  696. hp^.next:=nil;
  697. new(hp3);
  698. doregister(hp3);
  699. hp3^.typ:=ret_pattern;
  700. hp3^.pattern:=hp;
  701. hp3^.alternative:=nil;
  702. hp3^.elsepath:=elsepath;
  703. hp3^.minoccurs:=0;
  704. hp3^.maxoccurs:=1;
  705. hp3^.next:=nil;
  706. if assigned(chaining) then
  707. chaining^:=hp3
  708. else
  709. parseregexpr:=hp3;
  710. chaining:=@hp3^.next;
  711. end;
  712. '{':
  713. begin
  714. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  715. exit;
  716. new(hp);
  717. doregister(hp);
  718. hp^.typ:=ret_charset;
  719. hp^.chars:=cs;
  720. hp^.elsepath:=nil;
  721. hp^.next:=nil;
  722. new(hp3);
  723. doregister(hp3);
  724. hp3^.typ:=ret_pattern;
  725. hp3^.alternative:=nil;
  726. hp3^.pattern:=hp;
  727. hp3^.elsepath:=elsepath;
  728. hp3^.minoccurs:=minOccurs;
  729. hp3^.maxoccurs:=maxOccurs;
  730. hp3^.next:=nil;
  731. if assigned(chaining) then
  732. chaining^:=hp3
  733. else
  734. parseregexpr:=hp3;
  735. chaining:=@hp3^.next;
  736. end;
  737. else
  738. { Normal character }
  739. begin
  740. new(hp);
  741. doregister(hp);
  742. hp^.typ:=ret_charset;
  743. hp^.chars:=cs;
  744. hp^.elsepath:=elsepath;
  745. hp^.next:=next;
  746. if assigned(chaining) then
  747. chaining^:=hp
  748. else
  749. parseregexpr:=hp;
  750. chaining:=@hp^.next;
  751. continue;
  752. end;
  753. { This was a pattern }
  754. end; { END CASE }
  755. end;
  756. end;
  757. end;
  758. end;
  759. var
  760. endp : pregexprentry;
  761. begin
  762. GenerateRegExprEngine:=false;
  763. RegExprEngine.Data:=nil;
  764. RegExprEngine.DestroyList:=nil;
  765. if regexpr=nil then
  766. exit;
  767. first:=nil;
  768. if (ref_singleline in flags) and (ref_multiline in flags) then
  769. exit;
  770. currentpos:=regexpr;
  771. GenerateRegExprEngine:=true;
  772. error:=false;
  773. new(endp);
  774. doregister(endp);
  775. endp^.typ:=ret_illegalend;
  776. RegExprEngine.flags:=flags;
  777. RegExprEngine.Data:=parseregexpr(nil,endp);
  778. {$IFDEF DEBUG}
  779. writeln('========== Generating tree ============');
  780. dumptree('',RegExprEngine.Data);
  781. {$ENDIF}
  782. RegExprEngine.DestroyList:=first;
  783. if error or (currentpos^<>#0) then
  784. begin
  785. GenerateRegExprEngine:=false;
  786. DestroyRegExprEngine(RegExprEngine);
  787. end;
  788. end;
  789. {$IFDEF FPC}
  790. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  791. var
  792. r: TRegExprEngine;
  793. begin
  794. GenerateRegExprEngine(regexpr,flags,r);
  795. GenerateRegExprEngine:=r;
  796. end;
  797. {$ENDIF}
  798. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  799. var
  800. hp : pregexprentry;
  801. begin
  802. hp:=regexpr.DestroyList;
  803. while assigned(hp) do
  804. begin
  805. regexpr.DestroyList:=hp^.nextdestroy;
  806. dispose(hp);
  807. hp:=regexpr.DestroyList;
  808. end;
  809. regexpr.Data:=nil;
  810. regexpr.DestroyList:=nil;
  811. end;
  812. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  813. var
  814. lastpos : pchar;
  815. firstpos: pchar;
  816. { Does the actual search of the data - return true if the term was found }
  817. function dosearch(regexprentry : pregexprentry;pos : pchar) : boolean;
  818. var
  819. found: boolean;
  820. checkvalue: boolean;
  821. savedpos: pchar;
  822. counter: word;
  823. begin
  824. dosearch:=false;
  825. while true do
  826. begin
  827. {$IFDEF Debug}
  828. writeln('Entering ',typ2str[regexprentry^.typ]);
  829. writeln('Pattern length ',patlength(regexprentry));
  830. {$ENDIF Debug}
  831. case regexprentry^.typ of
  832. ret_endline:
  833. begin
  834. { automatically a match! }
  835. if pos^ = #0 then
  836. begin
  837. dosearch:=true;
  838. exit;
  839. end;
  840. if ref_multiline in regexprengine.flags then
  841. begin
  842. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  843. { avoid reading invalid memory also }
  844. if (pos^=#13) and ((pos+1)^=#10) then
  845. begin
  846. regexprentry:=regexprentry^.next;
  847. end
  848. else
  849. if (pos^=#$85) or (pos^=#10) or ((pos^=#13) and ((pos-1) >= firstpos) and ((pos-1)^ <> #10)) then
  850. begin
  851. regexprentry:=regexprentry^.next;
  852. end
  853. else
  854. begin
  855. dosearch:=false;
  856. exit;
  857. end;
  858. end
  859. else
  860. exit;
  861. end;
  862. ret_pattern:
  863. begin
  864. found:=false;
  865. { Take care of occurences here }
  866. savedpos:=pos;
  867. counter:=0;
  868. repeat
  869. found:=dosearch(regexprentry^.pattern,pos);
  870. if not found then
  871. break;
  872. pos:=lastpos;
  873. inc(counter);
  874. until (not found) or (counter >= regexprentry^.maxoccurs) or (pos^= #0);
  875. if counter = 0 then
  876. begin
  877. { If there was no occurence and the minimum occurence is > 0 then
  878. problem.
  879. }
  880. if (regexprentry^.minoccurs > 0) then
  881. begin
  882. dosearch:=false;
  883. { verify alternative path as required }
  884. if assigned(regexprentry^.alternative) then
  885. begin
  886. dosearch:=dosearch(regexprentry^.alternative,savedpos);
  887. exit;
  888. end;
  889. exit;
  890. end;
  891. dosearch:=true;
  892. lastpos:=savedpos;
  893. end
  894. else
  895. { found }
  896. begin
  897. { Possible choices :
  898. - found and (counter >= minOccurences) and (counter =< maxOccurences) = true
  899. - found and (counter < minOccurences) or (counter > maxOccurences) = false
  900. }
  901. if (counter < regexprentry^.minoccurs) or (counter > regexprentry^.maxoccurs) then
  902. begin
  903. dosearch:=false;
  904. exit;
  905. end;
  906. dosearch:=true;
  907. { if all matches were found, and the current position
  908. points to zero (processed all characters) }
  909. if pos^=#0 then
  910. begin
  911. dosearch:=true;
  912. exit;
  913. end;
  914. end;
  915. { If we are that means the matches were valid, go to next element to match
  916. }
  917. regexprentry:=regexprentry^.next;
  918. if (counter = 0) and not assigned(regexprentry) then
  919. exit;
  920. end;
  921. ret_startline:
  922. begin
  923. checkvalue:=pos=firstpos;
  924. if ref_multiline in regexprengine.flags then
  925. begin
  926. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  927. { avoid reading invalid memory also }
  928. if
  929. (
  930. ((pos-1) >= firstpos) and ((pos-1)^=#$85)
  931. )
  932. or
  933. (
  934. ((pos-1) >= firstpos) and ((pos-1)^=#10)
  935. )
  936. or
  937. (
  938. ((pos-1) >= firstpos) and ((pos-1)^=#13) and
  939. ((pos)^ <> #10)
  940. )
  941. then
  942. begin
  943. checkvalue:=true;
  944. end;
  945. end;
  946. if checkvalue then
  947. begin
  948. dosearch:=dosearch(regexprentry^.pattern,pos);
  949. regexprentry:=regexprentry^.next;
  950. if not dosearch then
  951. exit;
  952. pos:=lastpos;
  953. end
  954. else
  955. begin
  956. dosearch:=false;
  957. exit;
  958. end;
  959. end;
  960. ret_charset:
  961. begin
  962. if (pos^ in regexprentry^.chars) or
  963. ((ref_caseinsensitive in regexprengine.flags) and
  964. (upcase(pos^) in regexprentry^.chars)) then
  965. begin
  966. {$ifdef DEBUG}
  967. writeln('Found matching: ',pos^);
  968. {$endif DEBUG}
  969. regexprentry:=regexprentry^.next;
  970. inc(pos);
  971. end
  972. else
  973. begin
  974. {$ifdef DEBUG}
  975. writeln('Found unmatching: ',pos^);
  976. {$endif DEBUG}
  977. exit;
  978. end;
  979. end;
  980. ret_backtrace:
  981. begin
  982. {$ifdef DEBUG}
  983. writeln('Starting backtrace');
  984. {$endif DEBUG}
  985. if dosearch(regexprentry^.next,pos) then
  986. begin
  987. dosearch:=true;
  988. exit;
  989. end
  990. else if dosearch(regexprentry^.elsepath,pos) then
  991. begin
  992. dosearch:=true;
  993. exit;
  994. end
  995. else
  996. exit;
  997. end;
  998. end;
  999. lastpos:=pos;
  1000. if regexprentry=nil then
  1001. begin
  1002. dosearch:=true;
  1003. exit;
  1004. end;
  1005. if regexprentry^.typ=ret_illegalend then
  1006. exit;
  1007. { end of string, and we were expecting an end of string }
  1008. if (pos^=#0) and (regexprentry^.typ = ret_endline) and
  1009. (not assigned(regexprentry^.next)) then
  1010. begin
  1011. dosearch:=true;
  1012. exit;
  1013. end;
  1014. if pos^=#0 then
  1015. exit;
  1016. end;
  1017. end;
  1018. begin
  1019. RegExprPos:=false;
  1020. index:=0;
  1021. len:=0;
  1022. firstpos:=p;
  1023. if regexprengine.Data=nil then
  1024. exit;
  1025. while p^<>#0 do
  1026. begin
  1027. if dosearch(regexprengine.Data,p) then
  1028. begin
  1029. len:=lastpos-p;
  1030. RegExprPos:=true;
  1031. exit;
  1032. end
  1033. else
  1034. begin
  1035. inc(p);
  1036. inc(index);
  1037. end;
  1038. end;
  1039. index:=-1;
  1040. end;
  1041. function RegExprReplaceAll(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  1042. var
  1043. index,len : longint;
  1044. pos,lastpos : pchar;
  1045. first : boolean;
  1046. oldlength : PtrInt;
  1047. begin
  1048. pos:=pchar(src);
  1049. lastpos:=nil;
  1050. first:=true;
  1051. Result:=0;
  1052. { estimate some length }
  1053. SetLength(dest,length(src)+((length(src) div 10)*length(newstr)));
  1054. while RegExprPos(RegExprEngine,pos,index,len) do
  1055. begin
  1056. inc(pos,index);
  1057. if (lastpos = nil) or (pos>lastpos) then
  1058. begin
  1059. if lastpos = nil then lastpos := pchar(src);
  1060. { copy skipped part }
  1061. { because we cheat with SetLength a SetLength(...,0) isn't what we want
  1062. so we've to trick at the first SetLength call
  1063. }
  1064. if first then
  1065. begin
  1066. SetLength(dest,(pos-lastpos));
  1067. { cast dest here because it is already unified }
  1068. move(lastpos^,char(dest[1]),pos-lastpos);
  1069. end
  1070. else
  1071. begin
  1072. oldlength:=Length(dest);
  1073. SetLength(dest,oldlength+(pos-lastpos));
  1074. move(lastpos^,char(dest[oldlength+1]),pos-lastpos);
  1075. end;
  1076. first:=false;
  1077. end;
  1078. { found }
  1079. inc(Result);
  1080. dest:=dest+newstr;
  1081. inc(pos,len);
  1082. lastpos:=pos;
  1083. end;
  1084. { copy remainder }
  1085. len:=strlen(pos);
  1086. if first then
  1087. begin
  1088. SetLength(dest,len);
  1089. move(pos^,char(dest[length(dest)+1]),len);
  1090. end
  1091. else
  1092. begin
  1093. oldlength:=Length(dest);
  1094. SetLength(dest,oldlength+len);
  1095. move(pos^,char(dest[oldlength+1]),len);
  1096. end
  1097. end;
  1098. function RegExprEscapeStr (const S : string) : string;
  1099. var
  1100. i, len : integer;
  1101. s1: string;
  1102. begin
  1103. RegExprEscapeStr:= '';
  1104. s1:='';
  1105. if (S = '') then
  1106. exit;
  1107. len := Length (S);
  1108. for i := 1 to len do
  1109. begin
  1110. if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
  1111. begin
  1112. s1 := s1 + '\';
  1113. end;
  1114. s1 := s1 + S[i];
  1115. end;
  1116. RegExprEscapeStr:=s1;
  1117. end;
  1118. begin
  1119. cs_nonwordchars:=cs_allchars-cs_wordchars;
  1120. cs_nondigits:=cs_allchars-cs_digits;
  1121. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  1122. end.