regex.pp 39 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273
  1. {
  2. This file is part of the Free Pascal packages library.
  3. Copyright (c) 2008 by Joost van der Sluis, member of the
  4. Free Pascal development team
  5. Regexpression parser
  6. This code is based on the examples in the book
  7. 'Tomes of Delphi: Algorithms and Data Structures' by Julian M Bucknall
  8. The code is used with his permission. For an excellent explanation of
  9. this unit, see the book...
  10. See the file COPYING.FPC, included in this distribution,
  11. for details about the copyright.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. **********************************************************************}
  16. unit Regex;
  17. {$mode Delphi}{$H+}
  18. {$INLINE ON}
  19. interface
  20. {Notes:
  21. these classes parse regular expressions that follow this grammar:
  22. <anchorexpr> ::= <expr> |
  23. '^' <expr> |
  24. <expr> '$' |
  25. '^' <expr> '$'
  26. <expr> ::= <term> |
  27. <term> '|' <expr> - alternation
  28. <term> ::= <factor> |
  29. <factor><term> - concatenation
  30. <factor> ::= <atom> |
  31. <atom> '?' | - zero or one
  32. <atom> '*' | - zero or more
  33. <atom> 'n,m' | - min n, max m (added by Joost)
  34. <atom> '+' - one or more
  35. <atom> ::= <char> |
  36. '.' | - any char
  37. '(' <expr> ') | - parentheses
  38. '[' <charclass> ']' | - normal class
  39. '[^' <charclass> ']' - negated class
  40. <charclass> ::= <charrange> |
  41. <charrange><charclass>
  42. <charrange> ::= <ccchar> |
  43. <ccchar> '-' <ccchar>
  44. <char> ::= <any character except metacharacters> |
  45. '\' <any character at all>
  46. <ccchar> ::= <any character except '-' and ']'> |
  47. '\' <any character at all>
  48. This means that parentheses have maximum precedence, followed
  49. by square brackets, followed by the closure operators,
  50. followed by concatenation, finally followed by alternation.
  51. }
  52. uses
  53. SysUtils,
  54. Classes;
  55. type
  56. TUpcaseFunc = function(aCh : AnsiChar) : AnsiChar;
  57. TNFAMatchType = ( {types of matching performed...}
  58. mtNone, {..no match (an epsilon no-cost move)}
  59. mtAnyChar, {..any character}
  60. mtChar, {..a particular character}
  61. mtClass, {..a character class}
  62. mtDupClass, {..a character class beying referenced}
  63. mtNegClass, {..a negated character class}
  64. mtTerminal, {..the final state--no matching}
  65. mtUnused); {..an unused state--no matching}
  66. TRegexError = ( {error codes for invalid regex strings}
  67. recNone, {..no error}
  68. recSuddenEnd, {..unexpected end of string}
  69. recMetaChar, {..read metacharacter, but needed normal char}
  70. recNoCloseParen, {..expected close paren, but not there}
  71. recExtraChars {..not at end of string after parsing regex}
  72. );
  73. TRegexType = (
  74. rtRegEx,
  75. rtChars,
  76. rtSingleChar
  77. );
  78. PCharSet = ^TCharSet;
  79. TCharSet = set of Char;
  80. { TtdRegexEngine }
  81. TNFAState = record
  82. sdNextState1: integer;
  83. sdNextState2: integer;
  84. sdClass : PCharSet;
  85. sdMatchType : TNFAMatchType;
  86. sdChar : AnsiChar;
  87. end;
  88. { TRegexEngine }
  89. TRegexEngine = class
  90. private
  91. FAnchorEnd : boolean;
  92. FAnchorStart: boolean;
  93. FErrorCode : TRegexError;
  94. FIgnoreCase : boolean;
  95. FMultiLine : boolean;
  96. FPosn : PAnsiChar;
  97. FRegexStr : string;
  98. FStartState : integer;
  99. FStateTable : Array of TNFAState;
  100. FStateCount : integer;
  101. FUpcase : TUpcaseFunc;
  102. // The deque (double-ended queue)
  103. FList : array of integer;
  104. FCapacity : integer;
  105. FHead : integer;
  106. FTail : integer;
  107. FRegexType : TRegexType;
  108. protected
  109. procedure DequeEnqueue(aValue : integer);
  110. procedure DequePush(aValue : integer);
  111. function DequePop : integer;
  112. procedure DequeGrow;
  113. procedure rcSetIgnoreCase(aValue : boolean); virtual;
  114. procedure rcSetRegexStr(const aRegexStr : string); virtual;
  115. procedure rcSetUpcase(aValue : TUpcaseFunc); virtual;
  116. procedure rcSetMultiLine(aValue : Boolean); virtual;
  117. procedure rcClear; virtual;
  118. procedure rcError(aIndex : integer); virtual;
  119. procedure rcLevel1Optimize; virtual;
  120. function rcMatchSubString(const S : string;
  121. StartPosn : integer;
  122. var Len : integer) : boolean; virtual;
  123. function rcAddState(aMatchType : TNFAMatchType;
  124. aChar : AnsiChar;
  125. aCharClass : PCharSet;
  126. aNextState1: integer;
  127. aNextState2: integer) : integer;
  128. function rcSetState(aState : integer;
  129. aNextState1: integer;
  130. aNextState2: integer) : integer;
  131. function rcParseAnchorExpr : integer; virtual;
  132. function rcParseAtom : integer; virtual;
  133. function rcParseCCChar(out EscapeChar : Boolean) : AnsiChar; virtual;
  134. function rcParseChar : integer; virtual;
  135. function rcParseCharClass(aClass : PCharSet) : boolean; virtual;
  136. function rcParseCharRange(aClass : PCharSet) : boolean; virtual;
  137. function rcParseExpr : integer; virtual;
  138. function rcParseFactor : integer; virtual;
  139. function rcParseTerm : integer; virtual;
  140. Function rcReturnEscapeChar : AnsiChar; virtual;
  141. public
  142. procedure WriteTable;
  143. constructor Create(const aRegexStr : string);
  144. destructor Destroy; override;
  145. function Parse(out aErrorPos : integer;
  146. out aErrorCode: TRegexError) : boolean; virtual;
  147. function MatchString(const S : string; out MatchPos : integer; var Offset : integer) : boolean; virtual;
  148. function ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  149. property IgnoreCase : boolean
  150. read FIgnoreCase write rcSetIgnoreCase;
  151. property MultiLine : boolean
  152. read FMultiLine write rcSetMultiLine;
  153. property RegexString : string
  154. read FRegexStr write rcSetRegexStr;
  155. property Upcase : TUpcaseFunc
  156. read FUpcase write rcSetUpcase;
  157. end;
  158. Resourcestring
  159. eRegexParseError = 'Error at %d when parsing regular expression';
  160. implementation
  161. uses strutils;
  162. const
  163. MetaCharacters : set of AnsiChar =
  164. ['[', ']', '(', ')', '|', '*', '+', '?', '-', '.',
  165. '^', '$', '{', '}'];
  166. newline : TCharSet = [#10,#13,#$85];
  167. {some handy constants}
  168. UnusedState = -1;
  169. NewFinalState = -2;
  170. CreateNewState = -3;
  171. ErrorState = -4;
  172. MustScan = -5;
  173. cs_allchars : tcharset = [#0..#255];
  174. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  175. cs_newline : tcharset = [#10];
  176. cs_digits : tcharset = ['0'..'9'];
  177. cs_whitespace : tcharset = [' ',#9];
  178. {===Helper routines==================================================}
  179. function SystemUpcase(aCh : AnsiChar) : AnsiChar; far;
  180. begin
  181. Result := System.Upcase(aCh);
  182. end;
  183. {====================================================================}
  184. {===TRegexEngine===================================================}
  185. constructor TRegexEngine.Create(const aRegexStr : string);
  186. begin
  187. inherited Create;
  188. FRegexStr := aRegexStr;
  189. FIgnoreCase := false;
  190. FUpcase := SystemUpcase;
  191. SetLength(FStateTable,64);
  192. FStateCount:=0;
  193. FCapacity:=64;
  194. setlength(FList,FCapacity);
  195. {let's help out the user of the deque by putting the head and
  196. tail pointers in the middle: it's probably more efficient}
  197. FHead := FCapacity div 2;
  198. FTail := FHead;
  199. MultiLine:=False;
  200. end;
  201. {--------}
  202. destructor TRegexEngine.Destroy;
  203. begin
  204. if (FStateTable <> nil) then
  205. rcClear;
  206. inherited Destroy;
  207. end;
  208. {--------}
  209. function TRegexEngine.MatchString(const S : string; out MatchPos : integer; var Offset : integer): boolean;
  210. var
  211. i : integer;
  212. ErrorPos : integer;
  213. ErrorCode : TRegexError;
  214. pc : pchar;
  215. x:integer;
  216. begin
  217. if Offset>length(S) then
  218. begin
  219. Result := False;
  220. MatchPos := 0;
  221. Exit;
  222. end;
  223. {if the regex string hasn't been parsed yet, do so}
  224. if (FStateCount = 0) then begin
  225. if not Parse(ErrorPos, ErrorCode) then
  226. rcError(ErrorPos);
  227. end;
  228. case FRegexType of
  229. rtSingleChar :
  230. begin
  231. MatchPos := PosEx(char(FRegexStr[1]),s,Offset);
  232. Offset := MatchPos+1;
  233. Result := (MatchPos>0);
  234. end;
  235. rtChars :
  236. begin
  237. MatchPos := PosEx(FRegexStr,s,Offset);
  238. Offset := MatchPos+length(FRegexStr);
  239. Result := (MatchPos>0);
  240. end
  241. else
  242. begin
  243. {now try and see if the string matches (empty strings don't)}
  244. Result := False;
  245. MatchPos := 0;
  246. if (S <> '') then
  247. {if the regex specified a start anchor then we
  248. need to check the string starting at the first position}
  249. if FAnchorStart then begin
  250. if rcMatchSubString(S, 1, Offset) then
  251. begin
  252. MatchPos:=1;
  253. Result := True;
  254. end
  255. {If the first position did not match ang MultiLine is false, the string
  256. doesn't match. If MultiLine is true, start at every position after a
  257. newline }
  258. else if FMultiLine then begin
  259. for i := Offset to length(S)-1 do
  260. if S[i] in newline then
  261. if rcMatchSubString(S, i+1, Offset) then begin
  262. MatchPos := i+1;
  263. Result := True;
  264. Break;
  265. end;
  266. end
  267. end
  268. {otherwise we try and match the string at every position and
  269. return at the first success}
  270. else begin
  271. for i := Offset to length(S) do
  272. if rcMatchSubString(S, i, Offset) then begin
  273. MatchPos:=i;
  274. Result := True;
  275. Break;
  276. end;
  277. end;
  278. end;
  279. end; {case}
  280. end;
  281. function TRegexEngine.ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  282. type TReplRec = record
  283. Pos : integer;
  284. Len : integer;
  285. end;
  286. var ofs : Integer;
  287. size_newstr,
  288. size, pos : Integer;
  289. ReplArr : array of TReplRec;
  290. racount : integer;
  291. MatchPos : integer;
  292. DestSize : integer;
  293. LastPos : integer;
  294. MoveLen : integer;
  295. i : integer;
  296. begin
  297. setlength(ReplArr,64);
  298. racount := 0;
  299. DestSize:=length(src);
  300. size_newstr := length(newstr);
  301. Ofs := 1;
  302. while MatchString(src,MatchPos,Ofs) do
  303. begin
  304. if racount = length(ReplArr) then
  305. setlength(ReplArr,racount+racount div 2);
  306. ReplArr[racount].Pos := MatchPos;
  307. ReplArr[racount].Len := ofs;
  308. DestSize:=DestSize-ofs+MatchPos+size_newstr;
  309. inc(racount);
  310. end;
  311. SetLength(DestStr, SizeOf(Char)*DestSize);
  312. MatchPos:=1; LastPos:=1;
  313. if size_newstr<>0 then for i := 0 to racount -1 do
  314. begin
  315. MoveLen := ReplArr[i].Pos-LastPos;
  316. move(src[LastPos],DestStr[MatchPos],MoveLen);
  317. MatchPos:=MatchPos+MoveLen;
  318. LastPos := ReplArr[i].Len;
  319. move(newstr[1],DestStr[MatchPos],size_newstr);
  320. Matchpos := MatchPos+size_newstr;
  321. end
  322. else for i := 0 to racount -1 do
  323. begin
  324. MoveLen := ReplArr[i].Pos-LastPos;
  325. move(src[LastPos],DestStr[MatchPos],MoveLen);
  326. MatchPos:=MatchPos+MoveLen;
  327. LastPos := ReplArr[i].Len;
  328. end;
  329. move(src[LastPos],DestStr[MatchPos],length(src)-LastPos+1);
  330. Result := racount;
  331. end;
  332. {--------}
  333. function TRegexEngine.Parse(out aErrorPos : integer;
  334. out aErrorCode: TRegexError)
  335. : boolean;
  336. begin
  337. {clear the current transition table}
  338. rcClear;
  339. {empty regex strings are not allowed}
  340. if (FRegexStr = '') then begin
  341. Result := false;
  342. aErrorPos := 1;
  343. aErrorCode := recSuddenEnd;
  344. Exit;
  345. end;
  346. {parse the regex string}
  347. if not IgnoreCase then
  348. begin
  349. if length(FRegexStr)=1 then
  350. FRegexType:=rtSingleChar
  351. else
  352. FRegexType:=rtChars
  353. end
  354. else
  355. FRegexType:=rtRegEx;
  356. FPosn := PAnsiChar(FRegexStr);
  357. FStartState := rcParseAnchorExpr;
  358. {if an error occurred or we're not at the end of the regex string,
  359. clear the transition table, return false and the error position}
  360. if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
  361. if (FStartState <> ErrorState) and (FPosn^ <> #0) then
  362. FErrorCode := recExtraChars;
  363. rcClear;
  364. Result := false;
  365. aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
  366. aErrorCode := FErrorCode;
  367. end
  368. {otherwise add a terminal state, optimize, return true}
  369. else begin
  370. rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
  371. rcLevel1Optimize;
  372. if FAnchorStart or FAnchorEnd then FRegexType:= rtRegEx;
  373. Result := true;
  374. aErrorPos := 0;
  375. aErrorCode := recNone;
  376. end;
  377. end;
  378. {--------}
  379. function TRegexEngine.rcAddState(aMatchType : TNFAMatchType;
  380. aChar : AnsiChar;
  381. aCharClass : PCharSet;
  382. aNextState1: integer;
  383. aNextState2: integer) : integer;
  384. begin
  385. {set up the fields in the state record}
  386. with FStateTable[FStateCount] do
  387. begin
  388. if (aNextState1 = NewFinalState) then
  389. sdNextState1 := FStateCount+1
  390. else
  391. sdNextState1 := aNextState1;
  392. sdNextState2 := aNextState2;
  393. sdMatchType := aMatchType;
  394. if (aMatchType = mtChar) then
  395. sdChar := aChar
  396. else if aMatchType in [mtClass, mtDupClass, mtNegClass] then
  397. sdClass := aCharClass;
  398. end;
  399. Result := FStateCount;
  400. inc(FStateCount);
  401. if FStateCount=length(FStateTable) then
  402. setlength(FStateTable,(FStateCount * 3) div 2);
  403. if not (aMatchType in [mtChar,mtTerminal,mtNone]) then FRegexType := rtRegEx;
  404. end;
  405. {--------}
  406. procedure TRegexEngine.rcClear;
  407. var
  408. i, j : integer;
  409. begin
  410. {free all items in the state transition table}
  411. for i := 0 to FStateCount-1 do begin
  412. with FStateTable[i] do begin
  413. if (sdMatchType = mtClass) or
  414. (sdMatchType = mtNegClass) and
  415. (sdClass <> nil) then
  416. begin
  417. for j := i+1 to FStateCount-1 do
  418. if (FStateTable[j].sdClass = sdClass) then
  419. FStateTable[j].sdClass := nil;
  420. FreeMem(sdClass, sizeof(TCharSet));
  421. end;
  422. // I am not sure if the next line is necessary. rcAddState set all values, so
  423. // it shouldn't be necessary to clear its contents?
  424. // FillChar(FStateTable[i],SizeOf(FStateTable[i]),#0);
  425. end;
  426. end;
  427. {clear the state transition table}
  428. FStateCount:=0;
  429. FAnchorStart := false;
  430. FAnchorEnd := false;
  431. end;
  432. {--------}
  433. procedure TRegexEngine.rcError(aIndex : integer);
  434. begin
  435. raise Exception.Create(Format(eRegexParseError,[aIndex]));
  436. end;
  437. {--------}
  438. procedure TRegexEngine.rcLevel1Optimize;
  439. var
  440. i : integer;
  441. Walker : integer;
  442. begin
  443. {level 1 optimization removes all states that have only a single
  444. no-cost move to another state}
  445. {cycle through all the state records, except for the last one}
  446. for i := 0 to FStateCount - 2 do begin
  447. {get this state}
  448. with FStateTable[i] do begin
  449. {walk the chain pointed to by the first next state, unlinking
  450. the states that are simple single no-cost moves}
  451. Walker := sdNextState1;
  452. while (FStateTable[walker].sdMatchType = mtNone) and
  453. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  454. sdNextState1 := FStateTable[walker].sdNextState1;
  455. Walker := sdNextState1;
  456. end;
  457. {walk the chain pointed to by the second next state, unlinking
  458. the states that are simple single no-cost moves}
  459. if (sdNextState2 <> UnusedState) then begin
  460. Walker := sdNextState2;
  461. while (FStateTable[walker].sdMatchType = mtNone) and
  462. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  463. sdNextState2 := FStateTable[walker].sdNextState1;
  464. Walker := sdNextState2;
  465. end;
  466. end;
  467. end;
  468. end;
  469. {cycle through all the state records, except for the last one,
  470. marking unused ones--not strictly necessary but good for debugging}
  471. for i := 0 to FStateCount - 2 do begin
  472. with FStateTable[i] do begin
  473. if (sdMatchType = mtNone) and
  474. (sdNextState2 = UnusedState) then
  475. sdMatchType := mtUnused;
  476. end;
  477. end;
  478. end;
  479. {--------}
  480. function TRegexEngine.rcMatchSubString(const s : string;
  481. StartPosn : integer;
  482. var Len : integer)
  483. : boolean;
  484. var
  485. Ch : AnsiChar;
  486. State : integer;
  487. StrInx : integer;
  488. LenStr : integer;
  489. begin
  490. {assume we fail to match}
  491. Result := false;
  492. Len := StartPosn;
  493. LenStr := Length(s);
  494. {clear the deque}
  495. FHead := FCapacity div 2;
  496. FTail := FHead;
  497. {enqueue the special value to start scanning}
  498. DequeEnqueue(MustScan);
  499. {enqueue the first state}
  500. DequeEnqueue(FStartState);
  501. {prepare the string index}
  502. StrInx := StartPosn;
  503. {loop until the deque is empty or we run out of string}
  504. repeat
  505. {pop the top state from the deque}
  506. State := DequePop;
  507. {process the "must scan" state first}
  508. if (State = MustScan) then begin
  509. {if the deque is empty at this point, we might as well give up
  510. since there are no states left to process new characters}
  511. if (FHead <> FTail) then begin
  512. {if we haven't run out of string, get the character, and
  513. enqueue the "must scan" state again}
  514. if IgnoreCase then
  515. Ch := Upcase(s[StrInx])
  516. else
  517. Ch := s[StrInx];
  518. DequeEnqueue(MustScan);
  519. inc(StrInx);
  520. end;
  521. end
  522. {otherwise, process the state}
  523. else with FStateTable[State] do begin
  524. case sdMatchType of
  525. mtChar :
  526. begin
  527. {for a match of a character, enqueue the next state}
  528. if (Ch = sdChar) then
  529. DequeEnqueue(sdNextState1);
  530. end;
  531. mtAnyChar :
  532. begin
  533. {for a match of any character, enqueue the next state}
  534. if not (Ch in newline) then
  535. DequeEnqueue(sdNextState1);
  536. end;
  537. mtClass, mtDupClass :
  538. begin
  539. {for a match within a class, enqueue the next state}
  540. if (Ch in sdClass^) then
  541. DequeEnqueue(sdNextState1);
  542. end;
  543. mtNegClass :
  544. begin
  545. {for a match not within a class, enqueue the next state}
  546. if not (Ch in sdClass^) then
  547. DequeEnqueue(sdNextState1);
  548. end;
  549. mtTerminal :
  550. begin
  551. {for a terminal state, the string successfully matched
  552. if the regex had no end anchor, or we're at the end
  553. of the string or line}
  554. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  555. Result := true;
  556. Len := StrInx-1;
  557. // Exit;
  558. end;
  559. end;
  560. mtNone :
  561. begin
  562. {for free moves, push the next states onto the deque}
  563. Assert(sdNextState2 <> UnusedState,
  564. 'optimization should remove all states with one no-cost move');
  565. DequePush(sdNextState2);
  566. DequePush(sdNextState1);
  567. end;
  568. mtUnused :
  569. begin
  570. Assert(false, 'unused states shouldn''t be seen');
  571. end;
  572. end;
  573. end;
  574. until (FHead = FTail) or (StrInx > LenStr); // deque empty or end of string
  575. {if we reach this point we've either exhausted the deque or we've
  576. run out of string; if the former, the substring did not match
  577. since there are no more states. If the latter, we need to check
  578. the states left on the deque to see if one is the terminating
  579. state; if so the string matched the regular expression defined by
  580. the transition table}
  581. while (FHead <> FTail) and (StrInx<=LenStr) do begin
  582. State := DequePop;
  583. with FStateTable[State] do begin
  584. case sdMatchType of
  585. mtNone :
  586. begin
  587. {for free moves, push the next states onto the deque}
  588. Assert(sdNextState2 <> UnusedState,
  589. 'optimization should remove all states with one no-cost move');
  590. DequePush(sdNextState2);
  591. DequePush(sdNextState1);
  592. end;
  593. mtTerminal :
  594. begin
  595. {for a terminal state, the string successfully matched
  596. if the regex had no end anchor, or we're at the end
  597. of the string or line}
  598. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  599. Result := true;
  600. Len := StrInx -1;
  601. Exit;
  602. end;
  603. end;
  604. end;{case}
  605. end;
  606. end;
  607. end;
  608. {--------}
  609. function TRegexEngine.rcParseAnchorExpr : integer;
  610. begin
  611. {check for an initial '^'}
  612. if (FPosn^ = '^') then begin
  613. FAnchorStart := true;
  614. inc(FPosn);
  615. end;
  616. {parse an expression}
  617. Result := rcParseExpr;
  618. {if we were successful, check for the final '$'}
  619. if (Result <> ErrorState) then begin
  620. if (FPosn^ = '$') then begin
  621. FAnchorEnd := true;
  622. inc(FPosn);
  623. end;
  624. end;
  625. end;
  626. {--------}
  627. function TRegexEngine.rcParseAtom : integer;
  628. var
  629. MatchType : TNFAMatchType;
  630. CharClass : PCharSet;
  631. begin
  632. case FPosn^ of
  633. '(' :
  634. begin
  635. {move past the open parenthesis}
  636. inc(FPosn);
  637. {parse a complete regex between the parentheses}
  638. Result := rcParseExpr;
  639. if (Result = ErrorState) then
  640. Exit;
  641. {if the current character is not a close parenthesis,
  642. there's an error}
  643. if (FPosn^ <> ')') then begin
  644. FErrorCode := recNoCloseParen;
  645. Result := ErrorState;
  646. Exit;
  647. end;
  648. {move past the close parenthesis}
  649. inc(FPosn);
  650. {always handle expressions with parentheses as regular-expression}
  651. FRegexType := rtRegEx;
  652. end;
  653. '[' :
  654. begin
  655. {move past the open square bracket}
  656. inc(FPosn);
  657. {if the first character in the class is a '^' then the
  658. class if negated, otherwise it's a normal one}
  659. if (FPosn^ = '^') then begin
  660. inc(FPosn);
  661. MatchType := mtNegClass;
  662. end
  663. else begin
  664. MatchType := mtClass;
  665. end;
  666. {allocate the class character set and parse the character
  667. class; this will return either with an error, or when the
  668. closing square bracket is encountered}
  669. New(CharClass);
  670. CharClass^ := [];
  671. if not rcParseCharClass(CharClass) then begin
  672. Dispose(CharClass);
  673. Result := ErrorState;
  674. Exit;
  675. end;
  676. {move past the closing square bracket}
  677. Assert(FPosn^ = ']',
  678. 'the rcParseCharClass terminated without finding a "]"');
  679. inc(FPosn);
  680. {add a new state for the character class}
  681. Result := rcAddState(MatchType, #0, CharClass,
  682. NewFinalState, UnusedState);
  683. end;
  684. '.' :
  685. begin
  686. {move past the period metacharacter}
  687. inc(FPosn);
  688. {add a new state for the 'any character' token}
  689. Result := rcAddState(mtAnyChar, #0, nil,
  690. NewFinalState, UnusedState);
  691. end;
  692. '\' :
  693. begin
  694. if (FPosn+1)^ in ['d','D','s','S','w','W'] then begin
  695. New(CharClass);
  696. CharClass^ := [];
  697. if not rcParseCharRange(CharClass) then begin
  698. Dispose(CharClass);
  699. Result := ErrorState;
  700. Exit;
  701. end;
  702. Result := rcAddState(mtClass, #0, CharClass,
  703. NewFinalState, UnusedState);
  704. end
  705. else
  706. Result := rcParseChar;
  707. end;
  708. else
  709. {otherwise parse a single character}
  710. Result := rcParseChar;
  711. end;{case}
  712. end;
  713. {--------}
  714. function TRegexEngine.rcParseCCChar(out EscapeChar : Boolean) : AnsiChar;
  715. begin
  716. EscapeChar:=False;
  717. {if we hit the end of the string, it's an error}
  718. if (FPosn^ = #0) then begin
  719. FErrorCode := recSuddenEnd;
  720. Result := #0;
  721. Exit;
  722. end;
  723. {if the current char is a metacharacter (at least in terms of a
  724. character class), it's an error}
  725. if FPosn^ in [']', '-'] then begin
  726. FErrorCode := recMetaChar;
  727. Result := #0;
  728. Exit;
  729. end;
  730. {otherwise return the character and advance past it}
  731. if (FPosn^ = '\') then
  732. {..it's an escaped character: get the next character instead}
  733. begin
  734. inc(FPosn);
  735. EscapeChar:=True;
  736. Result := rcReturnEscapeChar;
  737. end
  738. else
  739. Result := FPosn^;
  740. inc(FPosn);
  741. end;
  742. {--------}
  743. function TRegexEngine.rcParseChar : integer;
  744. var
  745. Ch : AnsiChar;
  746. begin
  747. {if we hit the end of the string, it's an error}
  748. if (FPosn^ = #0) then begin
  749. Result := ErrorState;
  750. FErrorCode := recSuddenEnd;
  751. Exit;
  752. end;
  753. {if the current char is one of the metacharacters, it's an error}
  754. if FPosn^ in MetaCharacters then begin
  755. Result := ErrorState;
  756. FErrorCode := recMetaChar;
  757. Exit;
  758. end;
  759. {otherwise add a state for the character}
  760. {..if it's an escaped character: get the next character instead}
  761. if (FPosn^ = '\') then
  762. begin
  763. inc(FPosn);
  764. ch := rcReturnEscapeChar;
  765. FRegexType := rtRegEx;
  766. end
  767. else
  768. ch :=FPosn^;
  769. if IgnoreCase then
  770. Ch := Upcase(ch);
  771. Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
  772. inc(FPosn);
  773. end;
  774. {--------}
  775. function TRegexEngine.rcParseCharClass(aClass : PCharSet) : boolean;
  776. begin
  777. {assume we can't parse a character class properly}
  778. Result := false;
  779. {parse a character range; if we can't there was an error and the
  780. caller will take care of it}
  781. if not rcParseCharRange(aClass) then
  782. Exit;
  783. {if the current character was not the right bracket, parse another
  784. character class (note: we're removing the tail recursion here)}
  785. while (FPosn^ <> ']') do begin
  786. if not rcParseCharRange(aClass) then
  787. Exit;
  788. end;
  789. {if we reach here we were successful}
  790. Result := true;
  791. end;
  792. {--------}
  793. function TRegexEngine.rcParseCharRange(aClass : PCharSet) : boolean;
  794. var
  795. StartChar : AnsiChar;
  796. EndChar : AnsiChar;
  797. Ch : AnsiChar;
  798. EscChar : Boolean;
  799. begin
  800. {assume we can't parse a character range properly}
  801. Result := false;
  802. {parse a single character; if it's null there was an error}
  803. StartChar := rcParseCCChar(EscChar);
  804. if (StartChar = #0) then
  805. Exit;
  806. if EscChar then
  807. begin
  808. case StartChar of
  809. 'd' : aClass^ := aClass^ + cs_digits;
  810. 'D' : aClass^ := aClass^ + cs_allchars-cs_digits;
  811. 's' : aClass^ := aClass^ + cs_whitespace;
  812. 'S' : aClass^ := aClass^ + cs_allchars-cs_whitespace;
  813. 'w' : aClass^ := aClass^ + cs_wordchars;
  814. 'W' : aClass^ := aClass^ + cs_allchars-cs_wordchars
  815. else
  816. EscChar := False;
  817. end;
  818. if EscChar then
  819. begin
  820. Result := True;
  821. Exit;
  822. end;
  823. end;
  824. {if the current character is not a dash, the range consisted of a
  825. single character}
  826. if (FPosn^ <> '-') then begin
  827. if IgnoreCase then
  828. Include(aClass^, Upcase(StartChar))
  829. else
  830. Include(aClass^, StartChar)
  831. end
  832. {otherwise it's a real range, so get the character at the end of the
  833. range; if that's null, there was an error}
  834. else begin
  835. inc(FPosn); {move past the '-'}
  836. EndChar := rcParseCCChar(EscChar);
  837. if (EndChar = #0) then
  838. Exit;
  839. {build the range as a character set}
  840. if (StartChar > EndChar) then begin
  841. Ch := StartChar;
  842. StartChar := EndChar;
  843. EndChar := Ch;
  844. end;
  845. for Ch := StartChar to EndChar do begin
  846. Include(aClass^, Ch);
  847. if IgnoreCase then
  848. Include(aClass^, Upcase(Ch));
  849. end;
  850. end;
  851. {if we reach here we were successful}
  852. Result := true;
  853. end;
  854. {--------}
  855. function TRegexEngine.rcParseExpr : integer;
  856. var
  857. StartState1 : integer;
  858. StartState2 : integer;
  859. EndState1 : integer;
  860. OverallStartState : integer;
  861. begin
  862. {assume the worst}
  863. Result := ErrorState;
  864. {parse an initial term}
  865. StartState1 := rcParseTerm;
  866. if (StartState1 = ErrorState) then
  867. Exit;
  868. {if the current character is *not* a pipe character, no alternation
  869. is present so return the start state of the initial term as our
  870. start state}
  871. if (FPosn^ <> '|') then
  872. Result := StartState1
  873. {otherwise, we need to parse another expr and join the two together
  874. in the transition table}
  875. else begin
  876. {advance past the pipe}
  877. inc(FPosn);
  878. {the initial term's end state does not exist yet (although there
  879. is a state in the term that points to it), so create it}
  880. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  881. {for the OR construction we need a new initial state: it will
  882. point to the initial term and the second just-about-to-be-parsed
  883. expr}
  884. OverallStartState := rcAddState(mtNone, #0, nil,
  885. UnusedState, UnusedState);
  886. {parse another expr}
  887. StartState2 := rcParseExpr;
  888. if (StartState2 = ErrorState) then
  889. Exit;
  890. {alter the state state for the overall expr so that the second
  891. link points to the start of the second expr}
  892. Result := rcSetState(OverallStartState, StartState1, StartState2);
  893. {now set the end state for the initial term to point to the final
  894. end state for the second expr and the overall expr}
  895. rcSetState(EndState1, FStateCount, UnusedState);
  896. {always handle expressions with a pipe as regular-expression}
  897. FRegexType := rtRegEx;
  898. end;
  899. end;
  900. {--------}
  901. function TRegexEngine.rcParseFactor : integer;
  902. var
  903. StartStateAtom : integer;
  904. EndStateAtom : integer;
  905. TempEndStateAtom : integer;
  906. Int : string;
  907. n,m,nState : integer;
  908. i : integer;
  909. begin
  910. {assume the worst}
  911. Result := ErrorState;
  912. {first parse an atom}
  913. StartStateAtom := rcParseAtom;
  914. if (StartStateAtom = ErrorState) then
  915. Exit;
  916. {check for a closure operator}
  917. case FPosn^ of
  918. '?' : begin
  919. {move past the ? operator}
  920. inc(FPosn);
  921. {the atom's end state doesn't exist yet, so create one}
  922. EndStateAtom := rcAddState(mtNone, #0, nil,
  923. UnusedState, UnusedState);
  924. {create a new start state for the overall regex}
  925. Result := rcAddState(mtNone, #0, nil,
  926. StartStateAtom, EndStateAtom);
  927. {make sure the new end state points to the next unused
  928. state}
  929. rcSetState(EndStateAtom, FStateCount, UnusedState);
  930. end;
  931. '*' : begin
  932. {move past the * operator}
  933. inc(FPosn);
  934. {the atom's end state doesn't exist yet, so create one;
  935. it'll be the start of the overall regex subexpression}
  936. Result := rcAddState(mtNone, #0, nil,
  937. NewFinalState, StartStateAtom);
  938. end;
  939. '+' : begin
  940. {move past the + operator}
  941. inc(FPosn);
  942. {the atom's end state doesn't exist yet, so create one}
  943. rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
  944. {the start of the overall regex subexpression will be the
  945. atom's start state}
  946. Result := StartStateAtom;
  947. end;
  948. '{' : begin // {n,m}
  949. {move past the brace }
  950. inc(FPosn);
  951. {Parse the value of n}
  952. Int := '';
  953. while not (FPosn^ in [',','}',#0]) do
  954. begin
  955. int := int+FPosn^;
  956. inc(FPosn);
  957. end;
  958. if FPosn^ = #0 then exit; // No end-brace or comma -> invalid regex
  959. if int <> '' then
  960. n := StrToIntDef(Int,-2)
  961. else
  962. n := -1; // if n is 'empty', set it to -1
  963. if n = -2 then exit; // Invalid value for n -> invalid RegEx
  964. if FPosn^ <> '}' then
  965. begin
  966. {move past the , }
  967. inc(FPosn);
  968. {Parse the value of m}
  969. Int := '';
  970. while not (FPosn^ in ['}',#0]) do
  971. begin
  972. int := int+FPosn^;
  973. inc(FPosn);
  974. end;
  975. if FPosn^ <> '}' then exit; // No end-brace -> invalid regex
  976. if int <> '' then m := StrToIntDef(Int,-2)
  977. else m := -1;
  978. if m = -2 then exit; // Invalid RegEx
  979. end
  980. else
  981. m := -3;
  982. {move past the brace }
  983. inc(FPosn);
  984. if (n=0) and (m=-1) then
  985. {the atom's end state doesn't exist yet, so create one;
  986. it'll be the start of the overall regex subexpression}
  987. Result := rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom)
  988. else
  989. begin
  990. EndStateAtom := FStateCount-1;
  991. TempEndStateAtom:=StartStateAtom;
  992. for i := 1 to n-1 do
  993. begin
  994. TempEndStateAtom:=FStateCount;
  995. for nState:=StartStateAtom to EndStateAtom do
  996. begin
  997. FStateTable[FStateCount]:=FStateTable[nState];
  998. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  999. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) *i;
  1000. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1001. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) *i;
  1002. if FStateTable[FStateCount].sdMatchType = mtClass then
  1003. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1004. inc(FStateCount);
  1005. if FStateCount=length(FStateTable) then
  1006. setlength(FStateTable,(FStateCount * 3) div 2);
  1007. end;
  1008. end;
  1009. for i := n to m-1 do
  1010. begin
  1011. rcAddState(mtNone, #0, nil, NewFinalState, EndStateAtom+(EndStateAtom-StartStateAtom+1) * (m-1) + (m-n)+1);
  1012. TempEndStateAtom:=FStateCount;
  1013. for nState:=StartStateAtom to EndStateAtom do
  1014. begin
  1015. FStateTable[FStateCount]:=FStateTable[nState];
  1016. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  1017. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1018. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1019. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1020. if FStateTable[FStateCount].sdMatchType = mtClass then
  1021. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1022. inc(FStateCount);
  1023. if FStateCount=length(FStateTable) then
  1024. setlength(FStateTable,(FStateCount * 3) div 2);
  1025. end;
  1026. end;
  1027. if m = -1 then
  1028. rcAddState(mtNone, #0, nil, NewFinalState, TempEndStateAtom);
  1029. Result := StartStateAtom;
  1030. end;
  1031. {always handle expressions with braces as regular-expression}
  1032. FRegexType := rtRegEx;
  1033. end;
  1034. else
  1035. Result := StartStateAtom;
  1036. end;{case}
  1037. end;
  1038. {--------}
  1039. function TRegexEngine.rcParseTerm : integer;
  1040. var
  1041. StartState2 : integer;
  1042. EndState1 : integer;
  1043. begin
  1044. {parse an initial factor, the state number returned will also be our
  1045. return state number}
  1046. Result := rcParseFactor;
  1047. if (Result = ErrorState) then
  1048. Exit;
  1049. {Note: we have to "break the grammar" here. We've parsed a regular
  1050. subexpression and we're possibly following on with another
  1051. regular subexpression. There's no nice operator to key off
  1052. for concatenation: we just have to know that for
  1053. concatenating two subexpressions, the current character will
  1054. be
  1055. - an open parenthesis
  1056. - an open square bracket
  1057. - an any char operator
  1058. - a character that's not a metacharacter
  1059. i.e., the three possibilities for the start of an "atom" in
  1060. our grammar}
  1061. if (FPosn^ = '(') or
  1062. (FPosn^ = '[') or
  1063. (FPosn^ = '.') or
  1064. ((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
  1065. {the initial factor's end state does not exist yet (although there
  1066. is a state in the term that points to it), so create it}
  1067. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  1068. {parse another term}
  1069. StartState2 := rcParseTerm;
  1070. if (StartState2 = ErrorState) then begin
  1071. Result := ErrorState;
  1072. Exit;
  1073. end;
  1074. {join the first factor to the second term}
  1075. rcSetState(EndState1, StartState2, UnusedState);
  1076. end;
  1077. end;
  1078. procedure TRegexEngine.WriteTable;
  1079. var i : integer;
  1080. begin
  1081. for i := 0 to FStateCount-1 do with FStateTable[i] do
  1082. writeln('s:',i,' mt:',sdMatchType ,' ns1:',sdNextState1,' ns2:',sdNextState2,' char:',sdChar);
  1083. end;
  1084. procedure TRegexEngine.DequeEnqueue(aValue: integer);
  1085. begin
  1086. FList[FTail] := aValue;
  1087. inc(FTail);
  1088. if (FTail = FCapacity) then
  1089. FTail := 0
  1090. else if (FTail = FHead) then
  1091. DequeGrow;
  1092. end;
  1093. procedure TRegexEngine.DequePush(aValue: integer);
  1094. begin
  1095. if (FHead = 0) then
  1096. FHead := FCapacity;
  1097. dec(FHead);
  1098. FList[FHead] := aValue;
  1099. if (FTail = FHead) then
  1100. DequeGrow;
  1101. end;
  1102. function TRegexEngine.DequePop: integer;
  1103. begin
  1104. Result := FList[FHead];
  1105. inc(FHead);
  1106. if (FHead = FCapacity) then
  1107. FHead := 0;
  1108. end;
  1109. procedure TRegexEngine.DequeGrow;
  1110. var
  1111. OldCount : integer;
  1112. i, j : integer;
  1113. begin
  1114. {grow the list by 50%}
  1115. OldCount := FCapacity;
  1116. FCapacity:=(OldCount * 3) div 2;
  1117. SetLength(FList,FCapacity);
  1118. {expand the data into the increased space, maintaining the deque}
  1119. if (FHead = 0) then
  1120. FTail := OldCount
  1121. else begin
  1122. j := FCapacity;
  1123. for i := pred(OldCount) downto FHead do begin
  1124. dec(j);
  1125. FList[j] := FList[i]
  1126. end;
  1127. FHead := j;
  1128. end;
  1129. end;
  1130. function TRegexEngine.rcReturnEscapeChar: AnsiChar;
  1131. begin
  1132. case FPosn^ of
  1133. 't' : Result := #9;
  1134. 'n' : Result := #10;
  1135. 'r' : Result := #13;
  1136. 'f' : Result := #12;
  1137. 'a' : Result := #7;
  1138. else
  1139. Result := FPosn^;
  1140. end;
  1141. end;
  1142. {--------}
  1143. procedure TRegexEngine.rcSetIgnoreCase(aValue : boolean);
  1144. begin
  1145. if (aValue <> FIgnoreCase) then begin
  1146. rcClear;
  1147. FIgnoreCase := aValue;
  1148. end;
  1149. end;
  1150. {--------}
  1151. procedure TRegexEngine.rcSetRegexStr(const aRegexStr : string);
  1152. begin
  1153. if (aRegexStr <> FRegexStr) then begin
  1154. rcClear;
  1155. FRegexStr := aRegexStr;
  1156. end;
  1157. end;
  1158. {--------}
  1159. function TRegexEngine.rcSetState(aState : integer;
  1160. aNextState1: integer;
  1161. aNextState2: integer) : integer;
  1162. begin
  1163. Assert((0 <= aState) and (aState < FStateCount),
  1164. 'trying to change an invalid state');
  1165. {get the state record and change the transition information}
  1166. FStateTable[aState].sdNextState1 := aNextState1;
  1167. FStateTable[aState].sdNextState2 := aNextState2;
  1168. Result := aState;
  1169. end;
  1170. {--------}
  1171. procedure TRegexEngine.rcSetUpcase(aValue : TUpcaseFunc);
  1172. begin
  1173. if not Assigned(aValue) then
  1174. FUpcase := SystemUpcase
  1175. else
  1176. FUpcase := aValue;
  1177. end;
  1178. procedure TRegexEngine.rcSetMultiLine(aValue: Boolean);
  1179. begin
  1180. FMultiLine:=aValue;
  1181. end;
  1182. {====================================================================}
  1183. end.