regexpr.pp 43 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171
  1. {
  2. This unit implements basic regular expression support
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000-2006 by Florian Klaempfland Carl Eric Codere
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {.$ define DEBUG}
  12. (*
  13. - newline handling (uses all known formats of ASCII, #10,#13,#13#10 and #$85
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - full | support (currently requires to put all items with | operator
  17. between parenthesis (in a group) to take care over order priority.
  18. Therefore the following would work: (foo)|(nofoo) but not
  19. foo|nofoo
  20. - getting substrings and using substrings with \1 etc.
  21. - do we treat several multiline characters in a row as a single
  22. newline character for $ and ^?
  23. *)
  24. {$IFDEF FPC}
  25. {$mode objfpc}
  26. {$ENDIF}
  27. {** @abstract(Regular expression unit)
  28. This unit implements a basic regular expression parser that mostly conforms
  29. to the POSIX extended-regular expression syntax. It also supports standard
  30. escape characters for patterns (as defined in PERL).
  31. }
  32. unit regexpr;
  33. interface
  34. { the following declarions are only in the interface because }
  35. { some procedures return pregexprentry but programs which }
  36. { use this unit shouldn't access this data structures }
  37. type
  38. tcharset = set of char;
  39. tregexprentrytype = (ret_charset,ret_or,
  40. ret_illegalend,ret_backtrace,ret_startline,
  41. ret_endline,ret_pattern);
  42. pregexprentry = ^tregexprentry;
  43. tregexprentry = record
  44. next,nextdestroy : pregexprentry;
  45. case typ : tregexprentrytype of
  46. ret_charset : (chars : tcharset; elsepath : pregexprentry);
  47. {** This is a complete pattern path ()+ , ()* or ()?, n,m }
  48. ret_pattern: (pattern: pregexprentry; minoccurs: integer; maxoccurs: integer; alternative : pregexprentry);
  49. end;
  50. tregexprflag = (
  51. ref_singleline,
  52. {** This indicates that a start of line is either the
  53. start of the pattern or a linebreak. }
  54. ref_multiline,
  55. {** The match will be done in a case-insensitive way
  56. according to US-ASCII character set. }
  57. ref_caseinsensitive);
  58. tregexprflags = set of tregexprflag;
  59. TRegExprEngine = record
  60. Data : pregexprentry;
  61. DestroyList : pregexprentry;
  62. Flags : TRegExprFlags;
  63. end;
  64. const
  65. cs_allchars : tcharset = [#0..#255];
  66. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  67. cs_newline : tcharset = [#10];
  68. cs_digits : tcharset = ['0'..'9'];
  69. cs_whitespace : tcharset = [' ',#9];
  70. var
  71. { these are initilized in the init section of the unit }
  72. cs_nonwordchars : tcharset;
  73. cs_nondigits : tcharset;
  74. cs_nonwhitespace : tcharset;
  75. { the following procedures can be used by units basing }
  76. { on the regexpr unit }
  77. {** From a regular expression, compile an encoded version of the regular expression.
  78. @param(regexpr Regular expression to compile)
  79. @param(flags Flags relating to the type of parsing that will occur)
  80. @param(RegExprEngine The actual encoded version of the regular expression)
  81. @returns(true if success, otherwise syntax error in compiling regular expression)
  82. }
  83. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags;var RegExprEngine: TRegExprEngine): boolean;
  84. {$IFDEF FPC}
  85. {** Backward compatibility routine }
  86. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  87. {$ENDIF}
  88. {** Frees all up resources used for the encoded regular expression }
  89. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  90. {** @abstract(Matches a regular expression)
  91. @param(RegExprEngine The actual compiled regular expression to match against)
  92. @param(p The text to search for for a match)
  93. @param(index zero-based index to the start of the match -1 if no match in p)
  94. @param(len length of the match)
  95. @returns(true if there was a match, otherwise false)
  96. }
  97. function RegExprPos(RegExprEngine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  98. // function RegExprReplace(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  99. { This function Escape known regex chars and place the result on Return. If something went wrong the
  100. function will return false. }
  101. function RegExprEscapeStr (const S : string) : string;
  102. implementation
  103. {$ifdef DEBUG}
  104. procedure writecharset(c : tcharset);
  105. var
  106. b : byte;
  107. begin
  108. for b:=20 to 255 do
  109. if chr(b) in c then
  110. write(chr(b));
  111. writeln;
  112. end;
  113. const
  114. typ2str : array[tregexprentrytype] of string =
  115. (
  116. 'ret_charset',
  117. 'ret_or',
  118. 'ret_illegalend',
  119. 'ret_backtrace',
  120. 'ret_startline',
  121. 'ret_endline',
  122. 'ret_pattern'
  123. );
  124. { Dumps all the next elements of a tree }
  125. procedure dumptree(space: string; regentry: pregexprentry);
  126. begin
  127. while assigned(regentry) do
  128. begin
  129. WriteLn(space+'------- Node Type ',typ2str[regentry^.typ]);
  130. if (regentry^.typ = ret_charset) then
  131. WriteCharSet(regentry^.chars);
  132. { dump embedded pattern information }
  133. if regentry^.typ = ret_pattern then
  134. begin
  135. dumptree(space+#9,regentry^.pattern);
  136. WriteLn(space+#9,' --- Alternative nodes ');
  137. if assigned(regentry^.alternative) then
  138. dumptree(space+#9#9,regentry^.alternative);
  139. end;
  140. if regentry^.typ = ret_startline then
  141. dumptree(space+#9,regentry^.pattern);
  142. regentry:=regentry^.next;
  143. end;
  144. end;
  145. {$endif DEBUG}
  146. {** Determines the length of a pattern, including sub-patterns.
  147. It goes through the nodes and returns the pattern length
  148. between the two, using minOccurs as required.
  149. Called recursively.
  150. }
  151. function patlength(hp1: pregexprentry): integer;
  152. var
  153. count: integer;
  154. hp: pregexprentry;
  155. begin
  156. count:=0;
  157. if hp1^.typ=ret_pattern then
  158. hp:=hp1^.pattern
  159. else
  160. hp:=hp1;
  161. { now go through all chars and get the length
  162. does not currently take care of embedded patterns
  163. }
  164. while assigned(hp) do
  165. begin
  166. if hp^.typ = ret_pattern then
  167. begin
  168. inc(count,patlength(hp));
  169. end
  170. else
  171. if hp^.typ = ret_charset then
  172. inc(count);
  173. hp:=hp^.next;
  174. end;
  175. if hp1^.typ=ret_pattern then
  176. begin
  177. count:=hp1^.minOccurs*count;
  178. end;
  179. patlength:=count;
  180. end;
  181. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags; var RegExprEngine:TRegExprEngine) : boolean;
  182. var
  183. first : pregexprentry;
  184. procedure doregister(p : pregexprentry);
  185. begin
  186. p^.nextdestroy:=first;
  187. first:=p;
  188. end;
  189. var
  190. currentpos : pchar;
  191. error : boolean;
  192. procedure readchars(var chars: tcharset);
  193. var
  194. c1 : char;
  195. begin
  196. chars:=[];
  197. case currentpos^ of
  198. #0:
  199. exit;
  200. '.':
  201. begin
  202. inc(currentpos);
  203. chars:=cs_allchars-cs_newline;
  204. end;
  205. '\':
  206. begin
  207. inc(currentpos);
  208. case currentpos^ of
  209. #0:
  210. begin
  211. error:=true;
  212. exit;
  213. end;
  214. 't':
  215. begin
  216. inc(currentpos);
  217. chars:=[#9];
  218. end;
  219. 'n':
  220. begin
  221. inc(currentpos);
  222. chars:=[#10];
  223. end;
  224. 'r':
  225. begin
  226. inc(currentpos);
  227. chars:=[#13];
  228. end;
  229. 'd':
  230. begin
  231. inc(currentpos);
  232. chars:=cs_digits;
  233. end;
  234. 'D':
  235. begin
  236. inc(currentpos);
  237. chars:=cs_nondigits;
  238. end;
  239. 's':
  240. begin
  241. inc(currentpos);
  242. chars:=cs_whitespace;
  243. end;
  244. 'S':
  245. begin
  246. inc(currentpos);
  247. chars:=cs_nonwhitespace;
  248. end;
  249. 'w':
  250. begin
  251. inc(currentpos);
  252. chars:=cs_wordchars;
  253. end;
  254. 'W':
  255. begin
  256. inc(currentpos);
  257. chars:=cs_nonwordchars;
  258. end;
  259. 'f' :
  260. begin
  261. inc(currentpos);
  262. chars:= [#12];
  263. end;
  264. 'a' :
  265. begin
  266. inc(currentpos);
  267. chars:= [#7];
  268. end;
  269. else
  270. begin { Some basic escaping...}
  271. chars := [currentpos^];
  272. inc (currentpos);
  273. {error:=true;
  274. exit;}
  275. end;
  276. end;
  277. end;
  278. else
  279. begin
  280. if ref_caseinsensitive in flags then
  281. c1:=upcase(currentpos^)
  282. else
  283. c1:=currentpos^;
  284. inc(currentpos);
  285. if currentpos^='-' then
  286. begin
  287. inc(currentpos);
  288. if currentpos^=#0 then
  289. begin
  290. error:=true;
  291. exit;
  292. end;
  293. if ref_caseinsensitive in flags then
  294. chars:=[c1..upcase(currentpos^)]
  295. else
  296. chars:=[c1..currentpos^];
  297. inc(currentpos);
  298. end
  299. else
  300. chars:=[c1];
  301. end;
  302. end;
  303. end;
  304. procedure readcharset(var charset: tcharset);
  305. var
  306. chars: tcharset;
  307. begin
  308. charset:=[];
  309. case currentpos^ of
  310. #0:
  311. exit;
  312. '[':
  313. begin
  314. inc(currentpos);
  315. while currentpos^<>']' do
  316. begin
  317. if currentpos^='^' then
  318. begin
  319. inc(currentpos);
  320. readchars(chars);
  321. charset:=charset+(cs_allchars-chars);
  322. end
  323. else
  324. begin
  325. readchars(chars);
  326. charset:=charset+chars;
  327. end;
  328. if error or (currentpos^=#0) then
  329. begin
  330. error:=true;
  331. exit;
  332. end;
  333. end;
  334. inc(currentpos);
  335. end;
  336. '^':
  337. begin
  338. inc(currentpos);
  339. readchars(chars);
  340. charset:=cs_allchars-chars;
  341. end;
  342. else
  343. begin
  344. readchars(chars);
  345. charset:=chars;
  346. end;
  347. end;
  348. end;
  349. (* takes care of parsing the {n}, {n,} and {n,m} regular expression
  350. elements. In case of error, sets error to true and returns false,
  351. otherwise returns true and set minoccurs and maxoccurs accordingly
  352. (-1 if not present). *)
  353. function parseoccurences(var currentpos: pchar; var minoccurs,maxoccurs: integer): boolean;
  354. var
  355. minOccursString: string;
  356. maxOccursString: string;
  357. begin
  358. parseoccurences:=false;
  359. minOccurs:=-1;
  360. maxOccurs:=-1;
  361. inc(currentpos);
  362. minOccursString:='';
  363. if currentPos^ = #0 then
  364. begin
  365. error:=true;
  366. exit;
  367. end;
  368. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  369. begin
  370. minOccursString:=minOccursString+currentPos^;
  371. inc(currentpos);
  372. end;
  373. if length(minOccursString) = 0 then
  374. begin
  375. error:=true;
  376. exit;
  377. end;
  378. Val(minOccursString,minOccurs);
  379. { possible cases here: commad or end bracket }
  380. if currentpos^= '}' then
  381. begin
  382. inc(currentpos);
  383. maxOccurs:=minOccurs;
  384. parseoccurences:=true;
  385. exit;
  386. end;
  387. if currentpos^= ',' then
  388. begin
  389. maxOccursString:='';
  390. inc(currentpos);
  391. while (currentpos^<>#0) and (currentpos^ in ['0'..'9']) do
  392. begin
  393. maxOccursString:=maxOccursString+currentPos^;
  394. inc(currentpos);
  395. end;
  396. if currentpos^= '}' then
  397. begin
  398. { If the length of the string is zero, then there is
  399. no upper bound. }
  400. if length(maxOccursString) > 0 then
  401. Val(maxOccursString,maxOccurs)
  402. else
  403. maxOccurs:=high(integer);
  404. inc(currentpos);
  405. parseoccurences:=true;
  406. exit;
  407. end;
  408. end;
  409. error:=true;
  410. end;
  411. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  412. var
  413. hp : pregexprentry;
  414. minOccurs,maxOccurs: integer;
  415. hp3: pregexprentry;
  416. cs : tcharset;
  417. chaining : ^pregexprentry;
  418. begin
  419. chaining:=nil;
  420. parseregexpr:=nil;
  421. elsepath:=nil;
  422. if error then
  423. exit;
  424. { this dummy allows us to redirect the elsepath later }
  425. { new(ep);
  426. doregister(ep);
  427. ep^.typ:=ret_charset;
  428. ep^.chars:=[];
  429. ep^.elsepath:=elsepath;
  430. elsepath:=ep;}
  431. while true do
  432. begin
  433. if error then
  434. exit;
  435. case currentpos^ of
  436. '(':
  437. begin
  438. inc(currentpos);
  439. hp:=parseregexpr(nil,nil);
  440. { Special characters after the bracket }
  441. if error then
  442. exit;
  443. if currentpos^<>')' then
  444. begin
  445. error:=true;
  446. exit;
  447. end;
  448. inc(currentpos);
  449. case currentpos^ of
  450. '*':
  451. begin
  452. inc(currentpos);
  453. new(hp3);
  454. doregister(hp3);
  455. hp3^.typ:=ret_pattern;
  456. hp3^.alternative:=nil;
  457. hp3^.pattern:=hp;
  458. hp3^.elsepath:=elsepath;
  459. hp3^.minoccurs:=0;
  460. hp3^.maxoccurs:=high(integer);
  461. hp3^.next:=nil;
  462. if assigned(chaining) then
  463. chaining^:=hp3
  464. else
  465. parseregexpr:=hp3;
  466. chaining:=@hp3^.next;
  467. end;
  468. '+':
  469. begin
  470. inc(currentpos);
  471. new(hp3);
  472. doregister(hp3);
  473. hp3^.typ:=ret_pattern;
  474. hp3^.alternative:=nil;
  475. hp3^.pattern:=hp;
  476. hp3^.elsepath:=elsepath;
  477. hp3^.minoccurs:=1;
  478. hp3^.maxoccurs:=high(integer);
  479. hp3^.next:=nil;
  480. if assigned(chaining) then
  481. chaining^:=hp3
  482. else
  483. parseregexpr:=hp3;
  484. chaining:=@hp3^.next;
  485. end;
  486. '?':
  487. begin
  488. inc(currentpos);
  489. new(hp3);
  490. doregister(hp3);
  491. hp3^.typ:=ret_pattern;
  492. hp3^.alternative:=nil;
  493. hp3^.pattern:=hp;
  494. hp3^.elsepath:=elsepath;
  495. hp3^.minoccurs:=0;
  496. hp3^.maxoccurs:=1;
  497. hp3^.next:=nil;
  498. if assigned(chaining) then
  499. chaining^:=hp3
  500. else
  501. parseregexpr:=hp3;
  502. chaining:=@hp3^.next;
  503. end;
  504. '{':
  505. begin
  506. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  507. exit;
  508. inc(currentpos);
  509. new(hp3);
  510. doregister(hp3);
  511. hp3^.typ:=ret_pattern;
  512. hp3^.alternative:=nil;
  513. hp3^.pattern:=hp;
  514. hp3^.elsepath:=elsepath;
  515. hp3^.minoccurs:=minOccurs;
  516. hp3^.maxoccurs:=maxOccurs;
  517. hp3^.next:=nil;
  518. if assigned(chaining) then
  519. chaining^:=hp3
  520. else
  521. parseregexpr:=hp3;
  522. chaining:=@hp3^.next;
  523. end;
  524. else
  525. begin
  526. { go to end of this list - always the
  527. last next used }
  528. (*
  529. hp3:=hp;
  530. while assigned(hp3^.next) do
  531. begin
  532. hp3:=hp3^.next;
  533. end;
  534. if assigned(chaining) then
  535. chaining^:=hp
  536. else
  537. parseregexpr:=hp;
  538. chaining:=@hp3^.next;*)
  539. new(hp3);
  540. doregister(hp3);
  541. hp3^.typ:=ret_pattern;
  542. hp3^.alternative:=nil;
  543. hp3^.pattern:=hp;
  544. hp3^.elsepath:=elsepath;
  545. hp3^.minoccurs:=1;
  546. hp3^.maxoccurs:=1;
  547. hp3^.next:=nil;
  548. if assigned(chaining) then
  549. chaining^:=hp3
  550. else
  551. parseregexpr:=hp3;
  552. chaining:=@hp3^.next;
  553. end;
  554. end;
  555. end;
  556. { This is only partially implemented currently, as the terms before
  557. the | character must be grouped together with parenthesis, which
  558. is also compatible with other regular expressions.
  559. }
  560. '|':
  561. begin
  562. {$ifdef DEBUG}
  563. writeln('Creating or entry');
  564. {$endif DEBUG}
  565. if (not assigned (hp3)) then
  566. begin
  567. error:=true;
  568. exit;
  569. end;
  570. if (hp3^.typ <> ret_pattern) then
  571. begin
  572. error:=true;
  573. exit;
  574. end;
  575. while currentpos^='|' do
  576. begin
  577. inc(currentpos);
  578. if currentpos^=#0 then
  579. begin
  580. error:=true;
  581. exit;
  582. end;
  583. { always put the longest pattern first, so
  584. swap the trees as necessary.
  585. }
  586. hp := parseregexpr (next, elsepath);
  587. if patlength(hp) > patlength(hp3^.pattern) then
  588. begin
  589. hp3^.alternative:=hp3^.pattern;
  590. hp3^.pattern:=hp;
  591. end
  592. else
  593. hp3^.alternative:=hp;
  594. end;
  595. end;
  596. ')':
  597. exit;
  598. '^':
  599. begin
  600. inc(currentpos);
  601. hp:=parseregexpr(nil,nil);
  602. { Special characters after the bracket }
  603. if error then
  604. exit;
  605. new(hp3);
  606. doregister(hp3);
  607. hp3^.typ:=ret_startline;
  608. hp3^.pattern:=hp;
  609. hp3^.elsepath:=elsepath;
  610. hp3^.next:=nil;
  611. if assigned(chaining) then
  612. chaining^:=hp3
  613. else
  614. parseregexpr:=hp3;
  615. chaining:=@hp3^.next;
  616. end;
  617. '$':
  618. begin
  619. inc(currentpos);
  620. new(hp);
  621. doregister(hp);
  622. hp^.typ:=ret_endline;
  623. hp^.elsepath:=elsepath;
  624. hp^.next:=nil;
  625. if assigned(chaining) then
  626. chaining^:=hp
  627. else
  628. parseregexpr:=hp;
  629. chaining:=@hp^.next;
  630. end;
  631. #0:
  632. exit;
  633. else
  634. begin
  635. readcharset(cs);
  636. if error then
  637. exit;
  638. case currentpos^ of
  639. '*':
  640. begin
  641. inc(currentpos);
  642. new(hp);
  643. doregister(hp);
  644. hp^.typ:=ret_charset;
  645. hp^.chars:=cs;
  646. hp^.elsepath:=nil;
  647. hp^.next:=nil;
  648. new(hp3);
  649. doregister(hp3);
  650. hp3^.typ:=ret_pattern;
  651. hp3^.alternative:=nil;
  652. hp3^.pattern:=hp;
  653. hp3^.elsepath:=elsepath;
  654. hp3^.minoccurs:=0;
  655. hp3^.maxoccurs:=high(integer);
  656. hp3^.next:=nil;
  657. if assigned(chaining) then
  658. chaining^:=hp3
  659. else
  660. parseregexpr:=hp3;
  661. chaining:=@hp3^.next;
  662. end;
  663. '+':
  664. begin
  665. inc(currentpos);
  666. new(hp);
  667. doregister(hp);
  668. hp^.typ:=ret_charset;
  669. hp^.chars:=cs;
  670. hp^.elsepath:=nil;
  671. hp^.next:=nil;
  672. new(hp3);
  673. doregister(hp3);
  674. hp3^.typ:=ret_pattern;
  675. hp3^.alternative:=nil;
  676. hp3^.pattern:=hp;
  677. hp3^.elsepath:=elsepath;
  678. hp3^.minoccurs:=1;
  679. hp3^.maxoccurs:=high(integer);
  680. hp3^.next:=nil;
  681. if assigned(chaining) then
  682. chaining^:=hp3
  683. else
  684. parseregexpr:=hp3;
  685. chaining:=@hp3^.next;
  686. end;
  687. '?':
  688. begin
  689. inc(currentpos);
  690. new(hp);
  691. doregister(hp);
  692. hp^.typ:=ret_charset;
  693. hp^.chars:=cs;
  694. hp^.elsepath:=nil;
  695. hp^.next:=nil;
  696. new(hp3);
  697. doregister(hp3);
  698. hp3^.typ:=ret_pattern;
  699. hp3^.pattern:=hp;
  700. hp3^.alternative:=nil;
  701. hp3^.elsepath:=elsepath;
  702. hp3^.minoccurs:=0;
  703. hp3^.maxoccurs:=1;
  704. hp3^.next:=nil;
  705. if assigned(chaining) then
  706. chaining^:=hp3
  707. else
  708. parseregexpr:=hp3;
  709. chaining:=@hp3^.next;
  710. end;
  711. '{':
  712. begin
  713. if not parseOccurences(currentPos,minOccurs,maxOccurs) then
  714. exit;
  715. new(hp);
  716. doregister(hp);
  717. hp^.typ:=ret_charset;
  718. hp^.chars:=cs;
  719. hp^.elsepath:=nil;
  720. hp^.next:=nil;
  721. new(hp3);
  722. doregister(hp3);
  723. hp3^.typ:=ret_pattern;
  724. hp3^.alternative:=nil;
  725. hp3^.pattern:=hp;
  726. hp3^.elsepath:=elsepath;
  727. hp3^.minoccurs:=minOccurs;
  728. hp3^.maxoccurs:=maxOccurs;
  729. hp3^.next:=nil;
  730. if assigned(chaining) then
  731. chaining^:=hp3
  732. else
  733. parseregexpr:=hp3;
  734. chaining:=@hp3^.next;
  735. end;
  736. else
  737. { Normal character }
  738. begin
  739. new(hp);
  740. doregister(hp);
  741. hp^.typ:=ret_charset;
  742. hp^.chars:=cs;
  743. hp^.elsepath:=elsepath;
  744. hp^.next:=next;
  745. if assigned(chaining) then
  746. chaining^:=hp
  747. else
  748. parseregexpr:=hp;
  749. chaining:=@hp^.next;
  750. continue;
  751. end;
  752. { This was a pattern }
  753. end; { END CASE }
  754. end;
  755. end;
  756. end;
  757. end;
  758. var
  759. endp : pregexprentry;
  760. begin
  761. GenerateRegExprEngine:=false;
  762. RegExprEngine.Data:=nil;
  763. RegExprEngine.DestroyList:=nil;
  764. if regexpr=nil then
  765. exit;
  766. first:=nil;
  767. if (ref_singleline in flags) and (ref_multiline in flags) then
  768. exit;
  769. currentpos:=regexpr;
  770. GenerateRegExprEngine:=true;
  771. error:=false;
  772. new(endp);
  773. doregister(endp);
  774. endp^.typ:=ret_illegalend;
  775. RegExprEngine.flags:=flags;
  776. RegExprEngine.Data:=parseregexpr(nil,endp);
  777. {$IFDEF DEBUG}
  778. writeln('========== Generating tree ============');
  779. dumptree('',RegExprEngine.Data);
  780. {$ENDIF}
  781. RegExprEngine.DestroyList:=first;
  782. if error or (currentpos^<>#0) then
  783. begin
  784. GenerateRegExprEngine:=false;
  785. DestroyRegExprEngine(RegExprEngine);
  786. end;
  787. end;
  788. {$IFDEF FPC}
  789. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags): TREGExprEngine;
  790. var
  791. r: TRegExprEngine;
  792. begin
  793. GenerateRegExprEngine(regexpr,flags,r);
  794. GenerateRegExprEngine:=r;
  795. end;
  796. {$ENDIF}
  797. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  798. var
  799. hp : pregexprentry;
  800. begin
  801. hp:=regexpr.DestroyList;
  802. while assigned(hp) do
  803. begin
  804. regexpr.DestroyList:=hp^.nextdestroy;
  805. dispose(hp);
  806. hp:=regexpr.DestroyList;
  807. end;
  808. regexpr.Data:=nil;
  809. regexpr.DestroyList:=nil;
  810. end;
  811. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  812. var
  813. lastpos : pchar;
  814. firstpos: pchar;
  815. { Does the actual search of the data - return true if the term was found }
  816. function dosearch(regexprentry : pregexprentry;pos : pchar) : boolean;
  817. var
  818. found: boolean;
  819. checkvalue: boolean;
  820. savedpos: pchar;
  821. counter: word;
  822. begin
  823. dosearch:=false;
  824. while true do
  825. begin
  826. {$IFDEF Debug}
  827. writeln('Entering ',typ2str[regexprentry^.typ]);
  828. writeln('Pattern length ',patlength(regexprentry));
  829. {$ENDIF Debug}
  830. case regexprentry^.typ of
  831. ret_endline:
  832. begin
  833. { automatically a match! }
  834. if pos^ = #0 then
  835. begin
  836. dosearch:=true;
  837. exit;
  838. end;
  839. if ref_multiline in regexprengine.flags then
  840. begin
  841. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  842. { avoid reading invalid memory also }
  843. if (pos^=#13) and ((pos+1)^=#10) then
  844. begin
  845. regexprentry:=regexprentry^.next;
  846. end
  847. else
  848. if (pos^=#$85) or (pos^=#10) or ((pos^=#13) and ((pos-1)^ <> #10)) then
  849. begin
  850. regexprentry:=regexprentry^.next;
  851. end
  852. else
  853. begin
  854. dosearch:=false;
  855. lastpos:=savedpos;
  856. exit;
  857. end;
  858. end
  859. else
  860. exit;
  861. end;
  862. ret_pattern:
  863. begin
  864. found:=false;
  865. { Take care of occurences here }
  866. savedpos:=pos;
  867. counter:=0;
  868. repeat
  869. found:=dosearch(regexprentry^.pattern,pos);
  870. if not found then
  871. break;
  872. pos:=lastpos;
  873. inc(counter);
  874. until (not found) or (counter >= regexprentry^.maxoccurs) or (pos^= #0);
  875. if counter = 0 then
  876. begin
  877. { If there was no occurence and the minimum occurence is > 0 then
  878. problem.
  879. }
  880. if (regexprentry^.minoccurs > 0) then
  881. begin
  882. dosearch:=false;
  883. { verify alternative path as required }
  884. if assigned(regexprentry^.alternative) then
  885. begin
  886. dosearch:=dosearch(regexprentry^.alternative,savedpos);
  887. exit;
  888. end;
  889. exit;
  890. end;
  891. dosearch:=true;
  892. lastpos:=savedpos;
  893. end
  894. else
  895. { found }
  896. begin
  897. { Possible choices :
  898. - found and (counter >= minOccurences) and (counter =< maxOccurences) = true
  899. - found and (counter < minOccurences) or (counter > maxOccurences) = false
  900. }
  901. if (counter < regexprentry^.minoccurs) or (counter > regexprentry^.maxoccurs) then
  902. begin
  903. dosearch:=false;
  904. exit;
  905. end;
  906. dosearch:=true;
  907. { if all matches were found, and the current position
  908. points to zero (processed all characters) }
  909. if pos^=#0 then
  910. begin
  911. dosearch:=true;
  912. exit;
  913. end;
  914. end;
  915. { If we are that means the matches were valid, go to next element to match
  916. }
  917. regexprentry:=regexprentry^.next;
  918. if (counter = 0) and not assigned(regexprentry) then
  919. exit;
  920. end;
  921. ret_startline:
  922. begin
  923. checkvalue:=pos=firstpos;
  924. if ref_multiline in regexprengine.flags then
  925. begin
  926. { Supports DOS/Commodore/UNIX/IBM Mainframe line formats }
  927. { avoid reading invalid memory also }
  928. if
  929. (
  930. ((pos-1) >= firstpos) and ((pos-1)^=#$85)
  931. )
  932. or
  933. (
  934. ((pos-1) >= firstpos) and ((pos-1)^=#10)
  935. )
  936. or
  937. (
  938. ((pos-1) >= firstpos) and ((pos-1)^=#13) and
  939. ((pos)^ <> #10)
  940. )
  941. then
  942. begin
  943. checkvalue:=true;
  944. end;
  945. end;
  946. if checkvalue then
  947. begin
  948. dosearch:=dosearch(regexprentry^.pattern,pos);
  949. regexprentry:=regexprentry^.next;
  950. if not dosearch then
  951. exit;
  952. pos:=lastpos;
  953. end
  954. else
  955. begin
  956. dosearch:=false;
  957. exit;
  958. end;
  959. end;
  960. ret_charset:
  961. begin
  962. if (pos^ in regexprentry^.chars) or
  963. ((ref_caseinsensitive in regexprengine.flags) and
  964. (upcase(pos^) in regexprentry^.chars)) then
  965. begin
  966. {$ifdef DEBUG}
  967. writeln('Found matching: ',pos^);
  968. {$endif DEBUG}
  969. regexprentry:=regexprentry^.next;
  970. inc(pos);
  971. end
  972. else
  973. begin
  974. {$ifdef DEBUG}
  975. writeln('Found unmatching: ',pos^);
  976. {$endif DEBUG}
  977. exit;
  978. end;
  979. end;
  980. ret_backtrace:
  981. begin
  982. {$ifdef DEBUG}
  983. writeln('Starting backtrace');
  984. {$endif DEBUG}
  985. if dosearch(regexprentry^.next,pos) then
  986. begin
  987. dosearch:=true;
  988. exit;
  989. end
  990. else if dosearch(regexprentry^.elsepath,pos) then
  991. begin
  992. dosearch:=true;
  993. exit;
  994. end
  995. else
  996. exit;
  997. end;
  998. end;
  999. lastpos:=pos;
  1000. if regexprentry=nil then
  1001. begin
  1002. dosearch:=true;
  1003. exit;
  1004. end;
  1005. if regexprentry^.typ=ret_illegalend then
  1006. exit;
  1007. { end of string, and we were expecting an end of string }
  1008. if (pos^=#0) and (regexprentry^.typ = ret_endline) and
  1009. (not assigned(regexprentry^.next)) then
  1010. begin
  1011. dosearch:=true;
  1012. exit;
  1013. end;
  1014. if pos^=#0 then
  1015. exit;
  1016. end;
  1017. end;
  1018. begin
  1019. RegExprPos:=false;
  1020. index:=0;
  1021. len:=0;
  1022. firstpos:=p;
  1023. if regexprengine.Data=nil then
  1024. exit;
  1025. while p^<>#0 do
  1026. begin
  1027. if dosearch(regexprengine.Data,p) then
  1028. begin
  1029. len:=lastpos-p;
  1030. RegExprPos:=true;
  1031. exit;
  1032. end
  1033. else
  1034. begin
  1035. inc(p);
  1036. inc(index);
  1037. end;
  1038. end;
  1039. index:=-1;
  1040. end;
  1041. {
  1042. function RegExprReplace(RegExprEngine : TRegExprEngine;const src,newstr : ansistring;var dest : ansistring) : sizeint;
  1043. var
  1044. index,len : longint;
  1045. pos,lastpos : pchar;
  1046. begin
  1047. pos:=pchar(src);
  1048. lastpos:=pos;
  1049. { estimate some length }
  1050. SetLength(dest,length(src)+((length(src) div 10)*length(newstr)));
  1051. while RegExprPos(RegExprEngine,pos,index,len) do
  1052. begin
  1053. if pos>lastpos then
  1054. begin
  1055. { cast dest here because it is already unified }
  1056. move(lastpos^,char(dest[length(dest)+1)]),pos-lastpos);
  1057. SetLength(dest,Length(dest)+(pos-lastpos));
  1058. end;
  1059. dest:=dest+newstr;
  1060. inc(pos,len);
  1061. end;
  1062. { copy remainder }
  1063. end;
  1064. }
  1065. function RegExprEscapeStr (const S : string) : string;
  1066. var
  1067. i, len : integer;
  1068. s1: string;
  1069. begin
  1070. RegExprEscapeStr:= '';
  1071. s1:='';
  1072. if (S = '') then
  1073. exit;
  1074. len := Length (S);
  1075. for i := 1 to len do
  1076. begin
  1077. if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
  1078. begin
  1079. s1 := s1 + '\';
  1080. end;
  1081. s1 := s1 + S[i];
  1082. end;
  1083. RegExprEscapeStr:=s1;
  1084. end;
  1085. begin
  1086. cs_nonwordchars:=cs_allchars-cs_wordchars;
  1087. cs_nondigits:=cs_allchars-cs_digits;
  1088. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  1089. end.