2
0

regex.pp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266
  1. {
  2. This file is part of the Free Pascal packages library.
  3. Copyright (c) 2008 by Joost van der Sluis, member of the
  4. Free Pascal development team
  5. Regexpression parser
  6. This code is based on the examples in the book
  7. 'Tomes of Delphi: Algorithms and Data Structures' by Julian M Bucknall
  8. The code is used with his permission. For an excellent explanation of
  9. this unit, see the book...
  10. See the file COPYING.FPC, included in this distribution,
  11. for details about the copyright.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. **********************************************************************}
  16. unit Regex;
  17. {$mode Delphi}{$H+}
  18. {$INLINE ON}
  19. interface
  20. {Notes:
  21. these classes parse regular expressions that follow this grammar:
  22. <anchorexpr> ::= <expr> |
  23. '^' <expr> |
  24. <expr> '$' |
  25. '^' <expr> '$'
  26. <expr> ::= <term> |
  27. <term> '|' <expr> - alternation
  28. <term> ::= <factor> |
  29. <factor><term> - concatenation
  30. <factor> ::= <atom> |
  31. <atom> '?' | - zero or one
  32. <atom> '*' | - zero or more
  33. <atom> 'n,m' | - min n, max m (added by Joost)
  34. <atom> '+' - one or more
  35. <atom> ::= <char> |
  36. '.' | - any char
  37. '(' <expr> ') | - parentheses
  38. '[' <charclass> ']' | - normal class
  39. '[^' <charclass> ']' - negated class
  40. <charclass> ::= <charrange> |
  41. <charrange><charclass>
  42. <charrange> ::= <ccchar> |
  43. <ccchar> '-' <ccchar>
  44. <char> ::= <any character except metacharacters> |
  45. '\' <any character at all>
  46. <ccchar> ::= <any character except '-' and ']'> |
  47. '\' <any character at all>
  48. This means that parentheses have maximum precedence, followed
  49. by square brackets, followed by the closure operators,
  50. followed by concatenation, finally followed by alternation.
  51. }
  52. uses
  53. SysUtils,
  54. Classes;
  55. type
  56. TUpcaseFunc = function(aCh : AnsiChar) : AnsiChar;
  57. TNFAMatchType = ( {types of matching performed...}
  58. mtNone, {..no match (an epsilon no-cost move)}
  59. mtAnyChar, {..any character}
  60. mtChar, {..a particular character}
  61. mtClass, {..a character class}
  62. mtNegClass, {..a negated character class}
  63. mtTerminal, {..the final state--no matching}
  64. mtUnused); {..an unused state--no matching}
  65. TRegexError = ( {error codes for invalid regex strings}
  66. recNone, {..no error}
  67. recSuddenEnd, {..unexpected end of string}
  68. recMetaChar, {..read metacharacter, but needed normal char}
  69. recNoCloseParen, {..expected close paren, but not there}
  70. recExtraChars {..not at end of string after parsing regex}
  71. );
  72. TRegexType = (
  73. rtRegEx,
  74. rtChars,
  75. rtSingleChar
  76. );
  77. PCharSet = ^TCharSet;
  78. TCharSet = set of Char;
  79. { TtdRegexEngine }
  80. TNFAState = record
  81. sdNextState1: integer;
  82. sdNextState2: integer;
  83. sdClass : PCharSet;
  84. sdMatchType : TNFAMatchType;
  85. sdChar : AnsiChar;
  86. end;
  87. { TRegexEngine }
  88. TRegexEngine = class
  89. private
  90. FAnchorEnd : boolean;
  91. FAnchorStart: boolean;
  92. FErrorCode : TRegexError;
  93. FIgnoreCase : boolean;
  94. FMultiLine : boolean;
  95. FPosn : PAnsiChar;
  96. FRegexStr : string;
  97. FStartState : integer;
  98. FStateTable : Array of TNFAState;
  99. FStateCount : integer;
  100. FUpcase : TUpcaseFunc;
  101. // The deque (double-ended queue)
  102. FList : array of integer;
  103. FCapacity : integer;
  104. FHead : integer;
  105. FTail : integer;
  106. FRegexType : TRegexType;
  107. protected
  108. procedure DequeEnqueue(aValue : integer);
  109. procedure DequePush(aValue : integer);
  110. function DequePop : integer;
  111. procedure DequeGrow;
  112. procedure rcSetIgnoreCase(aValue : boolean); virtual;
  113. procedure rcSetRegexStr(const aRegexStr : string); virtual;
  114. procedure rcSetUpcase(aValue : TUpcaseFunc); virtual;
  115. procedure rcSetMultiLine(aValue : Boolean); virtual;
  116. procedure rcClear; virtual;
  117. procedure rcError(aIndex : integer); virtual;
  118. procedure rcLevel1Optimize; virtual;
  119. function rcMatchSubString(const S : string;
  120. StartPosn : integer;
  121. var Len : integer) : boolean; virtual;
  122. function rcAddState(aMatchType : TNFAMatchType;
  123. aChar : AnsiChar;
  124. aCharClass : PCharSet;
  125. aNextState1: integer;
  126. aNextState2: integer) : integer;
  127. function rcSetState(aState : integer;
  128. aNextState1: integer;
  129. aNextState2: integer) : integer;
  130. function rcParseAnchorExpr : integer; virtual;
  131. function rcParseAtom : integer; virtual;
  132. function rcParseCCChar(out EscapeChar : Boolean) : AnsiChar; virtual;
  133. function rcParseChar : integer; virtual;
  134. function rcParseCharClass(aClass : PCharSet) : boolean; virtual;
  135. function rcParseCharRange(aClass : PCharSet) : boolean; virtual;
  136. function rcParseExpr : integer; virtual;
  137. function rcParseFactor : integer; virtual;
  138. function rcParseTerm : integer; virtual;
  139. Function rcReturnEscapeChar : AnsiChar; virtual;
  140. public
  141. procedure WriteTable;
  142. constructor Create(const aRegexStr : string);
  143. destructor Destroy; override;
  144. function Parse(out aErrorPos : integer;
  145. out aErrorCode: TRegexError) : boolean; virtual;
  146. function MatchString(const S : string; out MatchPos : integer; var Offset : integer) : boolean; virtual;
  147. function ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  148. property IgnoreCase : boolean
  149. read FIgnoreCase write rcSetIgnoreCase;
  150. property MultiLine : boolean
  151. read FMultiLine write rcSetMultiLine;
  152. property RegexString : string
  153. read FRegexStr write rcSetRegexStr;
  154. property Upcase : TUpcaseFunc
  155. read FUpcase write rcSetUpcase;
  156. end;
  157. Resourcestring
  158. eRegexParseError = 'Error at %d when parsing regular expression';
  159. implementation
  160. uses strutils;
  161. const
  162. MetaCharacters : set of AnsiChar =
  163. ['[', ']', '(', ')', '|', '*', '+', '?', '-', '.',
  164. '^', '$', '{', '}'];
  165. newline : TCharSet = [#10,#13,#$85];
  166. {some handy constants}
  167. UnusedState = -1;
  168. NewFinalState = -2;
  169. CreateNewState = -3;
  170. ErrorState = -4;
  171. MustScan = -5;
  172. cs_allchars : tcharset = [#0..#255];
  173. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  174. cs_newline : tcharset = [#10];
  175. cs_digits : tcharset = ['0'..'9'];
  176. cs_whitespace : tcharset = [' ',#9];
  177. {===Helper routines==================================================}
  178. function SystemUpcase(aCh : AnsiChar) : AnsiChar; far;
  179. begin
  180. Result := System.Upcase(aCh);
  181. end;
  182. {====================================================================}
  183. {===TRegexEngine===================================================}
  184. constructor TRegexEngine.Create(const aRegexStr : string);
  185. begin
  186. inherited Create;
  187. FRegexStr := aRegexStr;
  188. FIgnoreCase := false;
  189. FUpcase := SystemUpcase;
  190. SetLength(FStateTable,64);
  191. FStateCount:=0;
  192. FCapacity:=64;
  193. setlength(FList,FCapacity);
  194. {let's help out the user of the deque by putting the head and
  195. tail pointers in the middle: it's probably more efficient}
  196. FHead := FCapacity div 2;
  197. FTail := FHead;
  198. MultiLine:=False;
  199. end;
  200. {--------}
  201. destructor TRegexEngine.Destroy;
  202. begin
  203. if (FStateTable <> nil) then
  204. rcClear;
  205. inherited Destroy;
  206. end;
  207. {--------}
  208. function TRegexEngine.MatchString(const S : string; out MatchPos : integer; var Offset : integer): boolean;
  209. var
  210. i : integer;
  211. ErrorPos : integer;
  212. ErrorCode : TRegexError;
  213. pc : pchar;
  214. x:integer;
  215. begin
  216. if Offset>length(S) then
  217. begin
  218. Result := False;
  219. MatchPos := 0;
  220. Exit;
  221. end;
  222. {if the regex string hasn't been parsed yet, do so}
  223. if (FStateCount = 0) then begin
  224. if not Parse(ErrorPos, ErrorCode) then
  225. rcError(ErrorPos);
  226. end;
  227. case FRegexType of
  228. rtSingleChar :
  229. begin
  230. MatchPos := PosEx(char(FRegexStr[1]),s,Offset);
  231. Offset := MatchPos+1;
  232. Result := (MatchPos>0);
  233. end;
  234. rtChars :
  235. begin
  236. MatchPos := PosEx(FRegexStr,s,Offset);
  237. Offset := MatchPos+length(FRegexStr);
  238. Result := (MatchPos>0);
  239. end
  240. else
  241. begin
  242. {now try and see if the string matches (empty strings don't)}
  243. Result := False;
  244. MatchPos := 0;
  245. if (S <> '') then
  246. {if the regex specified a start anchor then we
  247. need to check the string starting at the first position}
  248. if FAnchorStart then begin
  249. if rcMatchSubString(S, 1, Offset) then
  250. begin
  251. MatchPos:=1;
  252. Result := True;
  253. end
  254. {If the first position did not match ang MultiLine is false, the string
  255. doesn't match. If MultiLine is true, start at every position after a
  256. newline }
  257. else if FMultiLine then begin
  258. for i := Offset to length(S)-1 do
  259. if S[i] in newline then
  260. if rcMatchSubString(S, i+1, Offset) then begin
  261. MatchPos := i+1;
  262. Result := True;
  263. Break;
  264. end;
  265. end
  266. end
  267. {otherwise we try and match the string at every position and
  268. return at the first success}
  269. else begin
  270. for i := Offset to length(S) do
  271. if rcMatchSubString(S, i, Offset) then begin
  272. MatchPos:=i;
  273. Result := True;
  274. Break;
  275. end;
  276. end;
  277. end;
  278. end; {case}
  279. end;
  280. function TRegexEngine.ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  281. type TReplRec = record
  282. Pos : integer;
  283. Len : integer;
  284. end;
  285. var ofs : Integer;
  286. size_newstr,
  287. size, pos : Integer;
  288. ReplArr : array of TReplRec;
  289. racount : integer;
  290. MatchPos : integer;
  291. DestSize : integer;
  292. LastPos : integer;
  293. MoveLen : integer;
  294. i : integer;
  295. begin
  296. setlength(ReplArr,64);
  297. racount := 0;
  298. DestSize:=length(src);
  299. size_newstr := length(newstr);
  300. Ofs := 1;
  301. while MatchString(src,MatchPos,Ofs) do
  302. begin
  303. if racount = length(ReplArr) then
  304. setlength(ReplArr,racount+racount div 2);
  305. ReplArr[racount].Pos := MatchPos;
  306. ReplArr[racount].Len := ofs;
  307. DestSize:=DestSize-ofs+MatchPos+size_newstr;
  308. inc(racount);
  309. end;
  310. SetLength(DestStr, SizeOf(Char)*DestSize);
  311. MatchPos:=1; LastPos:=1;
  312. if size_newstr<>0 then for i := 0 to racount -1 do
  313. begin
  314. MoveLen := ReplArr[i].Pos-LastPos;
  315. move(src[LastPos],DestStr[MatchPos],MoveLen);
  316. MatchPos:=MatchPos+MoveLen;
  317. LastPos := ReplArr[i].Len;
  318. move(newstr[1],DestStr[MatchPos],size_newstr);
  319. Matchpos := MatchPos+size_newstr;
  320. end
  321. else for i := 0 to racount -1 do
  322. begin
  323. MoveLen := ReplArr[i].Pos-LastPos;
  324. move(src[LastPos],DestStr[MatchPos],MoveLen);
  325. MatchPos:=MatchPos+MoveLen;
  326. LastPos := ReplArr[i].Len;
  327. end;
  328. move(src[LastPos],DestStr[MatchPos],length(src)-LastPos+1);
  329. Result := racount;
  330. end;
  331. {--------}
  332. function TRegexEngine.Parse(out aErrorPos : integer;
  333. out aErrorCode: TRegexError)
  334. : boolean;
  335. begin
  336. {clear the current transition table}
  337. rcClear;
  338. {empty regex strings are not allowed}
  339. if (FRegexStr = '') then begin
  340. Result := false;
  341. aErrorPos := 1;
  342. aErrorCode := recSuddenEnd;
  343. Exit;
  344. end;
  345. {parse the regex string}
  346. if not IgnoreCase then
  347. begin
  348. if length(FRegexStr)=1 then
  349. FRegexType:=rtSingleChar
  350. else
  351. FRegexType:=rtChars
  352. end
  353. else
  354. FRegexType:=rtRegEx;
  355. FPosn := PAnsiChar(FRegexStr);
  356. FStartState := rcParseAnchorExpr;
  357. {if an error occurred or we're not at the end of the regex string,
  358. clear the transition table, return false and the error position}
  359. if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
  360. if (FStartState <> ErrorState) and (FPosn^ <> #0) then
  361. FErrorCode := recExtraChars;
  362. rcClear;
  363. Result := false;
  364. aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
  365. aErrorCode := FErrorCode;
  366. end
  367. {otherwise add a terminal state, optimize, return true}
  368. else begin
  369. rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
  370. rcLevel1Optimize;
  371. if FAnchorStart or FAnchorEnd then FRegexType:= rtRegEx;
  372. Result := true;
  373. aErrorPos := 0;
  374. aErrorCode := recNone;
  375. end;
  376. end;
  377. {--------}
  378. function TRegexEngine.rcAddState(aMatchType : TNFAMatchType;
  379. aChar : AnsiChar;
  380. aCharClass : PCharSet;
  381. aNextState1: integer;
  382. aNextState2: integer) : integer;
  383. begin
  384. {set up the fields in the state record}
  385. with FStateTable[FStateCount] do
  386. begin
  387. if (aNextState1 = NewFinalState) then
  388. sdNextState1 := FStateCount+1
  389. else
  390. sdNextState1 := aNextState1;
  391. sdNextState2 := aNextState2;
  392. sdMatchType := aMatchType;
  393. if (aMatchType = mtChar) then
  394. sdChar := aChar
  395. else if (aMatchType = mtClass) or (aMatchType = mtNegClass) then
  396. sdClass := aCharClass;
  397. end;
  398. Result := FStateCount;
  399. inc(FStateCount);
  400. if FStateCount=length(FStateTable) then
  401. setlength(FStateTable,(FStateCount * 3) div 2);
  402. if not (aMatchType in [mtChar,mtTerminal,mtNone]) then FRegexType := rtRegEx;
  403. end;
  404. {--------}
  405. procedure TRegexEngine.rcClear;
  406. var
  407. i, j : integer;
  408. begin
  409. {free all items in the state transition table}
  410. for i := 0 to FStateCount-1 do begin
  411. with FStateTable[i] do begin
  412. if (sdMatchType = mtClass) or
  413. (sdMatchType = mtNegClass) and
  414. (sdClass <> nil) then
  415. begin
  416. for j := i+1 to FStateCount-1 do
  417. if (FStateTable[j].sdClass = sdClass) then
  418. FStateTable[j].sdClass := nil;
  419. FreeMem(sdClass, sizeof(TCharSet));
  420. end;
  421. // I am not sure if the next line is necessary. rcAddState set all values, so
  422. // it shouldn't be necessary to clear its contents?
  423. // FillChar(FStateTable[i],SizeOf(FStateTable[i]),#0);
  424. end;
  425. end;
  426. {clear the state transition table}
  427. FStateCount:=0;
  428. FAnchorStart := false;
  429. FAnchorEnd := false;
  430. end;
  431. {--------}
  432. procedure TRegexEngine.rcError(aIndex : integer);
  433. begin
  434. raise Exception.Create(Format(eRegexParseError,[aIndex]));
  435. end;
  436. {--------}
  437. procedure TRegexEngine.rcLevel1Optimize;
  438. var
  439. i : integer;
  440. Walker : integer;
  441. begin
  442. {level 1 optimization removes all states that have only a single
  443. no-cost move to another state}
  444. {cycle through all the state records, except for the last one}
  445. for i := 0 to FStateCount - 2 do begin
  446. {get this state}
  447. with FStateTable[i] do begin
  448. {walk the chain pointed to by the first next state, unlinking
  449. the states that are simple single no-cost moves}
  450. Walker := sdNextState1;
  451. while (FStateTable[walker].sdMatchType = mtNone) and
  452. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  453. sdNextState1 := FStateTable[walker].sdNextState1;
  454. Walker := sdNextState1;
  455. end;
  456. {walk the chain pointed to by the second next state, unlinking
  457. the states that are simple single no-cost moves}
  458. if (sdNextState2 <> UnusedState) then begin
  459. Walker := sdNextState2;
  460. while (FStateTable[walker].sdMatchType = mtNone) and
  461. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  462. sdNextState2 := FStateTable[walker].sdNextState1;
  463. Walker := sdNextState2;
  464. end;
  465. end;
  466. end;
  467. end;
  468. {cycle through all the state records, except for the last one,
  469. marking unused ones--not strictly necessary but good for debugging}
  470. for i := 0 to FStateCount - 2 do begin
  471. with FStateTable[i] do begin
  472. if (sdMatchType = mtNone) and
  473. (sdNextState2 = UnusedState) then
  474. sdMatchType := mtUnused;
  475. end;
  476. end;
  477. end;
  478. {--------}
  479. function TRegexEngine.rcMatchSubString(const s : string;
  480. StartPosn : integer;
  481. var Len : integer)
  482. : boolean;
  483. var
  484. Ch : AnsiChar;
  485. State : integer;
  486. StrInx : integer;
  487. begin
  488. {assume we fail to match}
  489. Result := false;
  490. Len := StartPosn;
  491. {clear the deque}
  492. FHead := FCapacity div 2;
  493. FTail := FHead;
  494. {enqueue the special value to start scanning}
  495. DequeEnqueue(MustScan);
  496. {enqueue the first state}
  497. DequeEnqueue(FStartState);
  498. {prepare the string index}
  499. StrInx := StartPosn;
  500. {loop until the deque is empty or we run out of string}
  501. repeat
  502. {pop the top state from the deque}
  503. State := DequePop;
  504. {process the "must scan" state first}
  505. if (State = MustScan) then begin
  506. {if the deque is empty at this point, we might as well give up
  507. since there are no states left to process new characters}
  508. if (FHead <> FTail) then begin
  509. {if we haven't run out of string, get the character, and
  510. enqueue the "must scan" state again}
  511. if IgnoreCase then
  512. Ch := Upcase(s[StrInx])
  513. else
  514. Ch := s[StrInx];
  515. DequeEnqueue(MustScan);
  516. inc(StrInx);
  517. end;
  518. end
  519. {otherwise, process the state}
  520. else with FStateTable[State] do begin
  521. case sdMatchType of
  522. mtChar :
  523. begin
  524. {for a match of a character, enqueue the next state}
  525. if (Ch = sdChar) then
  526. DequeEnqueue(sdNextState1);
  527. end;
  528. mtAnyChar :
  529. begin
  530. {for a match of any character, enqueue the next state}
  531. if not (Ch in newline) then
  532. DequeEnqueue(sdNextState1);
  533. end;
  534. mtClass :
  535. begin
  536. {for a match within a class, enqueue the next state}
  537. if (Ch in sdClass^) then
  538. DequeEnqueue(sdNextState1);
  539. end;
  540. mtNegClass :
  541. begin
  542. {for a match not within a class, enqueue the next state}
  543. if not (Ch in sdClass^) then
  544. DequeEnqueue(sdNextState1);
  545. end;
  546. mtTerminal :
  547. begin
  548. {for a terminal state, the string successfully matched
  549. if the regex had no end anchor, or we're at the end
  550. of the string or line}
  551. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  552. Result := true;
  553. Len := StrInx-1;
  554. // Exit;
  555. end;
  556. end;
  557. mtNone :
  558. begin
  559. {for free moves, push the next states onto the deque}
  560. Assert(sdNextState2 <> UnusedState,
  561. 'optimization should remove all states with one no-cost move');
  562. DequePush(sdNextState2);
  563. DequePush(sdNextState1);
  564. end;
  565. mtUnused :
  566. begin
  567. Assert(false, 'unused states shouldn''t be seen');
  568. end;
  569. end;
  570. end;
  571. until (FHead = FTail) or (ch = #0); // deque empty or end of string
  572. {if we reach this point we've either exhausted the deque or we've
  573. run out of string; if the former, the substring did not match
  574. since there are no more states. If the latter, we need to check
  575. the states left on the deque to see if one is the terminating
  576. state; if so the string matched the regular expression defined by
  577. the transition table}
  578. while (FHead <> FTail) do begin
  579. State := DequePop;
  580. with FStateTable[State] do begin
  581. case sdMatchType of
  582. mtNone :
  583. begin
  584. {for free moves, push the next states onto the deque}
  585. Assert(sdNextState2 <> UnusedState,
  586. 'optimization should remove all states with one no-cost move');
  587. DequePush(sdNextState2);
  588. DequePush(sdNextState1);
  589. end;
  590. mtTerminal :
  591. begin
  592. {for a terminal state, the string successfully matched
  593. if the regex had no end anchor, or we're at the end
  594. of the string or line}
  595. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  596. Result := true;
  597. Len := StrInx -1;
  598. Exit;
  599. end;
  600. end;
  601. end;{case}
  602. end;
  603. end;
  604. end;
  605. {--------}
  606. function TRegexEngine.rcParseAnchorExpr : integer;
  607. begin
  608. {check for an initial '^'}
  609. if (FPosn^ = '^') then begin
  610. FAnchorStart := true;
  611. inc(FPosn);
  612. end;
  613. {parse an expression}
  614. Result := rcParseExpr;
  615. {if we were successful, check for the final '$'}
  616. if (Result <> ErrorState) then begin
  617. if (FPosn^ = '$') then begin
  618. FAnchorEnd := true;
  619. inc(FPosn);
  620. end;
  621. end;
  622. end;
  623. {--------}
  624. function TRegexEngine.rcParseAtom : integer;
  625. var
  626. MatchType : TNFAMatchType;
  627. CharClass : PCharSet;
  628. begin
  629. case FPosn^ of
  630. '(' :
  631. begin
  632. {move past the open parenthesis}
  633. inc(FPosn);
  634. {parse a complete regex between the parentheses}
  635. Result := rcParseExpr;
  636. if (Result = ErrorState) then
  637. Exit;
  638. {if the current character is not a close parenthesis,
  639. there's an error}
  640. if (FPosn^ <> ')') then begin
  641. FErrorCode := recNoCloseParen;
  642. Result := ErrorState;
  643. Exit;
  644. end;
  645. {move past the close parenthesis}
  646. inc(FPosn);
  647. {always handle expressions with parentheses as regular-expression}
  648. FRegexType := rtRegEx;
  649. end;
  650. '[' :
  651. begin
  652. {move past the open square bracket}
  653. inc(FPosn);
  654. {if the first character in the class is a '^' then the
  655. class if negated, otherwise it's a normal one}
  656. if (FPosn^ = '^') then begin
  657. inc(FPosn);
  658. MatchType := mtNegClass;
  659. end
  660. else begin
  661. MatchType := mtClass;
  662. end;
  663. {allocate the class character set and parse the character
  664. class; this will return either with an error, or when the
  665. closing square bracket is encountered}
  666. New(CharClass);
  667. CharClass^ := [];
  668. if not rcParseCharClass(CharClass) then begin
  669. Dispose(CharClass);
  670. Result := ErrorState;
  671. Exit;
  672. end;
  673. {move past the closing square bracket}
  674. Assert(FPosn^ = ']',
  675. 'the rcParseCharClass terminated without finding a "]"');
  676. inc(FPosn);
  677. {add a new state for the character class}
  678. Result := rcAddState(MatchType, #0, CharClass,
  679. NewFinalState, UnusedState);
  680. end;
  681. '.' :
  682. begin
  683. {move past the period metacharacter}
  684. inc(FPosn);
  685. {add a new state for the 'any character' token}
  686. Result := rcAddState(mtAnyChar, #0, nil,
  687. NewFinalState, UnusedState);
  688. end;
  689. '\' :
  690. begin
  691. if (FPosn+1)^ in ['d','D','s','S','w','W'] then begin
  692. New(CharClass);
  693. CharClass^ := [];
  694. if not rcParseCharRange(CharClass) then begin
  695. Dispose(CharClass);
  696. Result := ErrorState;
  697. Exit;
  698. end;
  699. Result := rcAddState(mtClass, #0, CharClass,
  700. NewFinalState, UnusedState);
  701. end
  702. else
  703. Result := rcParseChar;
  704. end;
  705. else
  706. {otherwise parse a single character}
  707. Result := rcParseChar;
  708. end;{case}
  709. end;
  710. {--------}
  711. function TRegexEngine.rcParseCCChar(out EscapeChar : Boolean) : AnsiChar;
  712. begin
  713. EscapeChar:=False;
  714. {if we hit the end of the string, it's an error}
  715. if (FPosn^ = #0) then begin
  716. FErrorCode := recSuddenEnd;
  717. Result := #0;
  718. Exit;
  719. end;
  720. {if the current char is a metacharacter (at least in terms of a
  721. character class), it's an error}
  722. if FPosn^ in [']', '-'] then begin
  723. FErrorCode := recMetaChar;
  724. Result := #0;
  725. Exit;
  726. end;
  727. {otherwise return the character and advance past it}
  728. if (FPosn^ = '\') then
  729. {..it's an escaped character: get the next character instead}
  730. begin
  731. inc(FPosn);
  732. EscapeChar:=True;
  733. Result := rcReturnEscapeChar;
  734. end
  735. else
  736. Result := FPosn^;
  737. inc(FPosn);
  738. end;
  739. {--------}
  740. function TRegexEngine.rcParseChar : integer;
  741. var
  742. Ch : AnsiChar;
  743. begin
  744. {if we hit the end of the string, it's an error}
  745. if (FPosn^ = #0) then begin
  746. Result := ErrorState;
  747. FErrorCode := recSuddenEnd;
  748. Exit;
  749. end;
  750. {if the current char is one of the metacharacters, it's an error}
  751. if FPosn^ in MetaCharacters then begin
  752. Result := ErrorState;
  753. FErrorCode := recMetaChar;
  754. Exit;
  755. end;
  756. {otherwise add a state for the character}
  757. {..if it's an escaped character: get the next character instead}
  758. if (FPosn^ = '\') then
  759. begin
  760. inc(FPosn);
  761. ch := rcReturnEscapeChar;
  762. FRegexType := rtRegEx;
  763. end
  764. else
  765. ch :=FPosn^;
  766. if IgnoreCase then
  767. Ch := Upcase(ch);
  768. Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
  769. inc(FPosn);
  770. end;
  771. {--------}
  772. function TRegexEngine.rcParseCharClass(aClass : PCharSet) : boolean;
  773. begin
  774. {assume we can't parse a character class properly}
  775. Result := false;
  776. {parse a character range; if we can't there was an error and the
  777. caller will take care of it}
  778. if not rcParseCharRange(aClass) then
  779. Exit;
  780. {if the current character was not the right bracket, parse another
  781. character class (note: we're removing the tail recursion here)}
  782. while (FPosn^ <> ']') do begin
  783. if not rcParseCharRange(aClass) then
  784. Exit;
  785. end;
  786. {if we reach here we were successful}
  787. Result := true;
  788. end;
  789. {--------}
  790. function TRegexEngine.rcParseCharRange(aClass : PCharSet) : boolean;
  791. var
  792. StartChar : AnsiChar;
  793. EndChar : AnsiChar;
  794. Ch : AnsiChar;
  795. EscChar : Boolean;
  796. begin
  797. {assume we can't parse a character range properly}
  798. Result := false;
  799. {parse a single character; if it's null there was an error}
  800. StartChar := rcParseCCChar(EscChar);
  801. if (StartChar = #0) then
  802. Exit;
  803. if EscChar then
  804. begin
  805. case StartChar of
  806. 'd' : aClass^ := aClass^ + cs_digits;
  807. 'D' : aClass^ := aClass^ + cs_allchars-cs_digits;
  808. 's' : aClass^ := aClass^ + cs_whitespace;
  809. 'S' : aClass^ := aClass^ + cs_allchars-cs_whitespace;
  810. 'w' : aClass^ := aClass^ + cs_wordchars;
  811. 'W' : aClass^ := aClass^ + cs_allchars-cs_wordchars
  812. else
  813. EscChar := False;
  814. end;
  815. if EscChar then
  816. begin
  817. Result := True;
  818. Exit;
  819. end;
  820. end;
  821. {if the current character is not a dash, the range consisted of a
  822. single character}
  823. if (FPosn^ <> '-') then begin
  824. if IgnoreCase then
  825. Include(aClass^, Upcase(StartChar))
  826. else
  827. Include(aClass^, StartChar)
  828. end
  829. {otherwise it's a real range, so get the character at the end of the
  830. range; if that's null, there was an error}
  831. else begin
  832. inc(FPosn); {move past the '-'}
  833. EndChar := rcParseCCChar(EscChar);
  834. if (EndChar = #0) then
  835. Exit;
  836. {build the range as a character set}
  837. if (StartChar > EndChar) then begin
  838. Ch := StartChar;
  839. StartChar := EndChar;
  840. EndChar := Ch;
  841. end;
  842. for Ch := StartChar to EndChar do begin
  843. Include(aClass^, Ch);
  844. if IgnoreCase then
  845. Include(aClass^, Upcase(Ch));
  846. end;
  847. end;
  848. {if we reach here we were successful}
  849. Result := true;
  850. end;
  851. {--------}
  852. function TRegexEngine.rcParseExpr : integer;
  853. var
  854. StartState1 : integer;
  855. StartState2 : integer;
  856. EndState1 : integer;
  857. OverallStartState : integer;
  858. begin
  859. {assume the worst}
  860. Result := ErrorState;
  861. {parse an initial term}
  862. StartState1 := rcParseTerm;
  863. if (StartState1 = ErrorState) then
  864. Exit;
  865. {if the current character is *not* a pipe character, no alternation
  866. is present so return the start state of the initial term as our
  867. start state}
  868. if (FPosn^ <> '|') then
  869. Result := StartState1
  870. {otherwise, we need to parse another expr and join the two together
  871. in the transition table}
  872. else begin
  873. {advance past the pipe}
  874. inc(FPosn);
  875. {the initial term's end state does not exist yet (although there
  876. is a state in the term that points to it), so create it}
  877. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  878. {for the OR construction we need a new initial state: it will
  879. point to the initial term and the second just-about-to-be-parsed
  880. expr}
  881. OverallStartState := rcAddState(mtNone, #0, nil,
  882. UnusedState, UnusedState);
  883. {parse another expr}
  884. StartState2 := rcParseExpr;
  885. if (StartState2 = ErrorState) then
  886. Exit;
  887. {alter the state state for the overall expr so that the second
  888. link points to the start of the second expr}
  889. Result := rcSetState(OverallStartState, StartState1, StartState2);
  890. {now set the end state for the initial term to point to the final
  891. end state for the second expr and the overall expr}
  892. rcSetState(EndState1, FStateCount, UnusedState);
  893. {always handle expressions with a pipe as regular-expression}
  894. FRegexType := rtRegEx;
  895. end;
  896. end;
  897. {--------}
  898. function TRegexEngine.rcParseFactor : integer;
  899. var
  900. StartStateAtom : integer;
  901. EndStateAtom : integer;
  902. TempEndStateAtom : integer;
  903. Int : string;
  904. n,m,nState : integer;
  905. i : integer;
  906. begin
  907. {assume the worst}
  908. Result := ErrorState;
  909. {first parse an atom}
  910. StartStateAtom := rcParseAtom;
  911. if (StartStateAtom = ErrorState) then
  912. Exit;
  913. {check for a closure operator}
  914. case FPosn^ of
  915. '?' : begin
  916. {move past the ? operator}
  917. inc(FPosn);
  918. {the atom's end state doesn't exist yet, so create one}
  919. EndStateAtom := rcAddState(mtNone, #0, nil,
  920. UnusedState, UnusedState);
  921. {create a new start state for the overall regex}
  922. Result := rcAddState(mtNone, #0, nil,
  923. StartStateAtom, EndStateAtom);
  924. {make sure the new end state points to the next unused
  925. state}
  926. rcSetState(EndStateAtom, FStateCount, UnusedState);
  927. end;
  928. '*' : begin
  929. {move past the * operator}
  930. inc(FPosn);
  931. {the atom's end state doesn't exist yet, so create one;
  932. it'll be the start of the overall regex subexpression}
  933. Result := rcAddState(mtNone, #0, nil,
  934. NewFinalState, StartStateAtom);
  935. end;
  936. '+' : begin
  937. {move past the + operator}
  938. inc(FPosn);
  939. {the atom's end state doesn't exist yet, so create one}
  940. rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
  941. {the start of the overall regex subexpression will be the
  942. atom's start state}
  943. Result := StartStateAtom;
  944. end;
  945. '{' : begin // {n,m}
  946. {move past the brace }
  947. inc(FPosn);
  948. {Parse the value of n}
  949. Int := '';
  950. while not (FPosn^ in [',','}',#0]) do
  951. begin
  952. int := int+FPosn^;
  953. inc(FPosn);
  954. end;
  955. if FPosn^ = #0 then exit; // No end-brace or comma -> invalid regex
  956. if int <> '' then
  957. n := StrToIntDef(Int,-2)
  958. else
  959. n := -1; // if n is 'empty', set it to -1
  960. if n = -2 then exit; // Invalid value for n -> invalid RegEx
  961. if FPosn^ <> '}' then
  962. begin
  963. {move past the , }
  964. inc(FPosn);
  965. {Parse the value of m}
  966. Int := '';
  967. while not (FPosn^ in ['}',#0]) do
  968. begin
  969. int := int+FPosn^;
  970. inc(FPosn);
  971. end;
  972. if FPosn^ <> '}' then exit; // No end-brace -> invalid regex
  973. if int <> '' then m := StrToIntDef(Int,-2)
  974. else m := -1;
  975. if m = -2 then exit; // Invalid RegEx
  976. end
  977. else
  978. m := -3;
  979. {move past the brace }
  980. inc(FPosn);
  981. if (n=0) and (m=-1) then
  982. {the atom's end state doesn't exist yet, so create one;
  983. it'll be the start of the overall regex subexpression}
  984. Result := rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom)
  985. else
  986. begin
  987. EndStateAtom := FStateCount-1;
  988. TempEndStateAtom:=StartStateAtom;
  989. for i := 1 to n-1 do
  990. begin
  991. TempEndStateAtom:=FStateCount;
  992. for nState:=StartStateAtom to EndStateAtom do
  993. begin
  994. FStateTable[FStateCount]:=FStateTable[nState];
  995. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  996. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) *i;
  997. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  998. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) *i;
  999. inc(FStateCount);
  1000. if FStateCount=length(FStateTable) then
  1001. setlength(FStateTable,(FStateCount * 3) div 2);
  1002. end;
  1003. end;
  1004. for i := n to m-1 do
  1005. begin
  1006. rcAddState(mtNone, #0, nil, NewFinalState, EndStateAtom+(EndStateAtom-StartStateAtom+1) * (m-1) + (m-n)+1);
  1007. TempEndStateAtom:=FStateCount;
  1008. for nState:=StartStateAtom to EndStateAtom do
  1009. begin
  1010. FStateTable[FStateCount]:=FStateTable[nState];
  1011. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  1012. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1013. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1014. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1015. inc(FStateCount);
  1016. if FStateCount=length(FStateTable) then
  1017. setlength(FStateTable,(FStateCount * 3) div 2);
  1018. end;
  1019. end;
  1020. if m = -1 then
  1021. rcAddState(mtNone, #0, nil, NewFinalState, TempEndStateAtom);
  1022. Result := StartStateAtom;
  1023. end;
  1024. {always handle expressions with braces as regular-expression}
  1025. FRegexType := rtRegEx;
  1026. end;
  1027. else
  1028. Result := StartStateAtom;
  1029. end;{case}
  1030. end;
  1031. {--------}
  1032. function TRegexEngine.rcParseTerm : integer;
  1033. var
  1034. StartState2 : integer;
  1035. EndState1 : integer;
  1036. begin
  1037. {parse an initial factor, the state number returned will also be our
  1038. return state number}
  1039. Result := rcParseFactor;
  1040. if (Result = ErrorState) then
  1041. Exit;
  1042. {Note: we have to "break the grammar" here. We've parsed a regular
  1043. subexpression and we're possibly following on with another
  1044. regular subexpression. There's no nice operator to key off
  1045. for concatenation: we just have to know that for
  1046. concatenating two subexpressions, the current character will
  1047. be
  1048. - an open parenthesis
  1049. - an open square bracket
  1050. - an any char operator
  1051. - a character that's not a metacharacter
  1052. i.e., the three possibilities for the start of an "atom" in
  1053. our grammar}
  1054. if (FPosn^ = '(') or
  1055. (FPosn^ = '[') or
  1056. (FPosn^ = '.') or
  1057. ((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
  1058. {the initial factor's end state does not exist yet (although there
  1059. is a state in the term that points to it), so create it}
  1060. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  1061. {parse another term}
  1062. StartState2 := rcParseTerm;
  1063. if (StartState2 = ErrorState) then begin
  1064. Result := ErrorState;
  1065. Exit;
  1066. end;
  1067. {join the first factor to the second term}
  1068. rcSetState(EndState1, StartState2, UnusedState);
  1069. end;
  1070. end;
  1071. procedure TRegexEngine.WriteTable;
  1072. var i : integer;
  1073. begin
  1074. for i := 0 to FStateCount-1 do with FStateTable[i] do
  1075. writeln('s:',i,' mt:',sdMatchType ,' ns1:',sdNextState1,' ns2:',sdNextState2,' char:',sdChar);
  1076. end;
  1077. procedure TRegexEngine.DequeEnqueue(aValue: integer);
  1078. begin
  1079. FList[FTail] := aValue;
  1080. inc(FTail);
  1081. if (FTail = FCapacity) then
  1082. FTail := 0
  1083. else if (FTail = FHead) then
  1084. DequeGrow;
  1085. end;
  1086. procedure TRegexEngine.DequePush(aValue: integer);
  1087. begin
  1088. if (FHead = 0) then
  1089. FHead := FCapacity;
  1090. dec(FHead);
  1091. FList[FHead] := aValue;
  1092. if (FTail = FHead) then
  1093. DequeGrow;
  1094. end;
  1095. function TRegexEngine.DequePop: integer;
  1096. begin
  1097. Result := FList[FHead];
  1098. inc(FHead);
  1099. if (FHead = FCapacity) then
  1100. FHead := 0;
  1101. end;
  1102. procedure TRegexEngine.DequeGrow;
  1103. var
  1104. OldCount : integer;
  1105. i, j : integer;
  1106. begin
  1107. {grow the list by 50%}
  1108. OldCount := FCapacity;
  1109. FCapacity:=(OldCount * 3) div 2;
  1110. SetLength(FList,FCapacity);
  1111. {expand the data into the increased space, maintaining the deque}
  1112. if (FHead = 0) then
  1113. FTail := OldCount
  1114. else begin
  1115. j := FCapacity;
  1116. for i := pred(OldCount) downto FHead do begin
  1117. dec(j);
  1118. FList[j] := FList[i]
  1119. end;
  1120. FHead := j;
  1121. end;
  1122. end;
  1123. function TRegexEngine.rcReturnEscapeChar: AnsiChar;
  1124. begin
  1125. case FPosn^ of
  1126. 't' : Result := #9;
  1127. 'n' : Result := #10;
  1128. 'r' : Result := #13;
  1129. 'f' : Result := #12;
  1130. 'a' : Result := #7;
  1131. else
  1132. Result := FPosn^;
  1133. end;
  1134. end;
  1135. {--------}
  1136. procedure TRegexEngine.rcSetIgnoreCase(aValue : boolean);
  1137. begin
  1138. if (aValue <> FIgnoreCase) then begin
  1139. rcClear;
  1140. FIgnoreCase := aValue;
  1141. end;
  1142. end;
  1143. {--------}
  1144. procedure TRegexEngine.rcSetRegexStr(const aRegexStr : string);
  1145. begin
  1146. if (aRegexStr <> FRegexStr) then begin
  1147. rcClear;
  1148. FRegexStr := aRegexStr;
  1149. end;
  1150. end;
  1151. {--------}
  1152. function TRegexEngine.rcSetState(aState : integer;
  1153. aNextState1: integer;
  1154. aNextState2: integer) : integer;
  1155. begin
  1156. Assert((0 <= aState) and (aState < FStateCount),
  1157. 'trying to change an invalid state');
  1158. {get the state record and change the transition information}
  1159. FStateTable[aState].sdNextState1 := aNextState1;
  1160. FStateTable[aState].sdNextState2 := aNextState2;
  1161. Result := aState;
  1162. end;
  1163. {--------}
  1164. procedure TRegexEngine.rcSetUpcase(aValue : TUpcaseFunc);
  1165. begin
  1166. if not Assigned(aValue) then
  1167. FUpcase := SystemUpcase
  1168. else
  1169. FUpcase := aValue;
  1170. end;
  1171. procedure TRegexEngine.rcSetMultiLine(aValue: Boolean);
  1172. begin
  1173. FMultiLine:=aValue;
  1174. end;
  1175. {====================================================================}
  1176. end.