regex.pp 38 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230
  1. {
  2. This file is part of the Free Pascal packages library.
  3. Copyright (c) 2008 by Joost van der Sluis, member of the
  4. Free Pascal development team
  5. Regexpression parser
  6. This code is based on the examples in the book
  7. 'Tomes of Delphi: Algorithms and Data Structures' by Julian M Bucknall
  8. The code is used with his permission. For an excellent explanation of
  9. this unit, see the book...
  10. See the file COPYING.FPC, included in this distribution,
  11. for details about the copyright.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. **********************************************************************}
  16. unit Regex;
  17. {$mode Delphi}{$H+}
  18. {$INLINE ON}
  19. interface
  20. {Notes:
  21. these classes parse regular expressions that follow this grammar:
  22. <anchorexpr> ::= <expr> |
  23. '^' <expr> |
  24. <expr> '$' |
  25. '^' <expr> '$'
  26. <expr> ::= <term> |
  27. <term> '|' <expr> - alternation
  28. <term> ::= <factor> |
  29. <factor><term> - concatenation
  30. <factor> ::= <atom> |
  31. <atom> '?' | - zero or one
  32. <atom> '*' | - zero or more
  33. <atom> 'n,m' | - min n, max m (added by Joost)
  34. <atom> '+' - one or more
  35. <atom> ::= <char> |
  36. '.' | - any char
  37. '(' <expr> ') | - parentheses
  38. '[' <charclass> ']' | - normal class
  39. '[^' <charclass> ']' - negated class
  40. <charclass> ::= <charrange> |
  41. <charrange><charclass>
  42. <charrange> ::= <ccchar> |
  43. <ccchar> '-' <ccchar>
  44. <char> ::= <any character except metacharacters> |
  45. '\' <any character at all>
  46. <ccchar> ::= <any character except '-' and ']'> |
  47. '\' <any character at all>
  48. This means that parentheses have maximum precedence, followed
  49. by square brackets, followed by the closure operators,
  50. followed by concatenation, finally followed by alternation.
  51. }
  52. uses
  53. SysUtils,
  54. Classes;
  55. type
  56. TUpcaseFunc = function(aCh : AnsiChar) : AnsiChar;
  57. TNFAMatchType = ( {types of matching performed...}
  58. mtNone, {..no match (an epsilon no-cost move)}
  59. mtAnyChar, {..any character}
  60. mtChar, {..a particular character}
  61. mtClass, {..a character class}
  62. mtNegClass, {..a negated character class}
  63. mtTerminal, {..the final state--no matching}
  64. mtUnused); {..an unused state--no matching}
  65. TRegexError = ( {error codes for invalid regex strings}
  66. recNone, {..no error}
  67. recSuddenEnd, {..unexpected end of string}
  68. recMetaChar, {..read metacharacter, but needed normal char}
  69. recNoCloseParen, {..expected close paren, but not there}
  70. recExtraChars {..not at end of string after parsing regex}
  71. );
  72. TRegexType = (
  73. rtRegEx,
  74. rtChars,
  75. rtSingleChar
  76. );
  77. PCharSet = ^TCharSet;
  78. TCharSet = set of Char;
  79. { TtdRegexEngine }
  80. TNFAState = record
  81. sdNextState1: integer;
  82. sdNextState2: integer;
  83. sdClass : PCharSet;
  84. sdMatchType : TNFAMatchType;
  85. sdChar : AnsiChar;
  86. end;
  87. { TRegexEngine }
  88. TRegexEngine = class
  89. private
  90. FAnchorEnd : boolean;
  91. FAnchorStart: boolean;
  92. FErrorCode : TRegexError;
  93. FIgnoreCase : boolean;
  94. FMultiLine : boolean;
  95. FPosn : PAnsiChar;
  96. FRegexStr : string;
  97. FStartState : integer;
  98. FStateTable : Array of TNFAState;
  99. FStateCount : integer;
  100. FUpcase : TUpcaseFunc;
  101. // The deque (double-ended queue)
  102. FList : array of integer;
  103. FCapacity : integer;
  104. FHead : integer;
  105. FTail : integer;
  106. FRegexType : TRegexType;
  107. protected
  108. procedure DequeEnqueue(aValue : integer);
  109. procedure DequePush(aValue : integer);
  110. function DequePop : integer;
  111. procedure DequeGrow;
  112. procedure rcSetIgnoreCase(aValue : boolean); virtual;
  113. procedure rcSetRegexStr(const aRegexStr : string); virtual;
  114. procedure rcSetUpcase(aValue : TUpcaseFunc); virtual;
  115. procedure rcSetMultiLine(aValue : Boolean); virtual;
  116. procedure rcClear; virtual;
  117. procedure rcError(aIndex : integer); virtual;
  118. procedure rcLevel1Optimize; virtual;
  119. function rcMatchSubString(const S : string;
  120. StartPosn : integer;
  121. var Len : integer) : boolean; virtual;
  122. function rcAddState(aMatchType : TNFAMatchType;
  123. aChar : AnsiChar;
  124. aCharClass : PCharSet;
  125. aNextState1: integer;
  126. aNextState2: integer) : integer;
  127. function rcSetState(aState : integer;
  128. aNextState1: integer;
  129. aNextState2: integer) : integer;
  130. function rcParseAnchorExpr : integer; virtual;
  131. function rcParseAtom : integer; virtual;
  132. function rcParseCCChar(out EscapeChar : Boolean) : AnsiChar; virtual;
  133. function rcParseChar : integer; virtual;
  134. function rcParseCharClass(aClass : PCharSet) : boolean; virtual;
  135. function rcParseCharRange(aClass : PCharSet) : boolean; virtual;
  136. function rcParseExpr : integer; virtual;
  137. function rcParseFactor : integer; virtual;
  138. function rcParseTerm : integer; virtual;
  139. Function rcReturnEscapeChar : AnsiChar; virtual;
  140. public
  141. procedure WriteTable;
  142. constructor Create(const aRegexStr : string);
  143. destructor Destroy; override;
  144. function Parse(var aErrorPos : integer;
  145. var aErrorCode: TRegexError) : boolean; virtual;
  146. function MatchString(const S : string; out MatchPos : integer; var Offset : integer) : boolean; virtual;
  147. function ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  148. property IgnoreCase : boolean
  149. read FIgnoreCase write rcSetIgnoreCase;
  150. property MultiLine : boolean
  151. read FMultiLine write rcSetMultiLine;
  152. property RegexString : string
  153. read FRegexStr write rcSetRegexStr;
  154. property Upcase : TUpcaseFunc
  155. read FUpcase write rcSetUpcase;
  156. end;
  157. Resourcestring
  158. eRegexParseError = 'Error at %d when parsing regular expression';
  159. implementation
  160. uses strutils;
  161. const
  162. MetaCharacters : set of AnsiChar =
  163. ['[', ']', '(', ')', '|', '*', '+', '?', '-', '.',
  164. '^', '$', '{', '}'];
  165. newline : TCharSet = [#10,#13,#$85];
  166. {some handy constants}
  167. UnusedState = -1;
  168. NewFinalState = -2;
  169. CreateNewState = -3;
  170. ErrorState = -4;
  171. MustScan = -5;
  172. cs_allchars : tcharset = [#0..#255];
  173. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  174. cs_newline : tcharset = [#10];
  175. cs_digits : tcharset = ['0'..'9'];
  176. cs_whitespace : tcharset = [' ',#9];
  177. {===Helper routines==================================================}
  178. function SystemUpcase(aCh : AnsiChar) : AnsiChar; far;
  179. begin
  180. Result := System.Upcase(aCh);
  181. end;
  182. {====================================================================}
  183. {===TRegexEngine===================================================}
  184. constructor TRegexEngine.Create(const aRegexStr : string);
  185. begin
  186. inherited Create;
  187. FRegexStr := aRegexStr;
  188. FIgnoreCase := false;
  189. FUpcase := SystemUpcase;
  190. SetLength(FStateTable,64);
  191. FStateCount:=0;
  192. FCapacity:=64;
  193. setlength(FList,FCapacity);
  194. {let's help out the user of the deque by putting the head and
  195. tail pointers in the middle: it's probably more efficient}
  196. FHead := FCapacity div 2;
  197. FTail := FHead;
  198. MultiLine:=False;
  199. end;
  200. {--------}
  201. destructor TRegexEngine.Destroy;
  202. begin
  203. if (FStateTable <> nil) then
  204. rcClear;
  205. inherited Destroy;
  206. end;
  207. {--------}
  208. function TRegexEngine.MatchString(const S : string; out MatchPos : integer; var Offset : integer): boolean;
  209. var
  210. i : integer;
  211. ErrorPos : integer;
  212. ErrorCode : TRegexError;
  213. pc : pchar;
  214. x:integer;
  215. begin
  216. {if the regex string hasn't been parsed yet, do so}
  217. if (FStateCount = 0) then begin
  218. if not Parse(ErrorPos, ErrorCode) then
  219. rcError(ErrorPos);
  220. end;
  221. case FRegexType of
  222. rtSingleChar :
  223. begin
  224. MatchPos := PosEx(char(FRegexStr[1]),s,Offset);
  225. Offset := MatchPos+1;
  226. Result := (MatchPos>0);
  227. end;
  228. rtChars :
  229. begin
  230. MatchPos := PosEx(FRegexStr,s,Offset);
  231. Offset := MatchPos+length(FRegexStr);
  232. Result := (MatchPos>0);
  233. end
  234. else
  235. begin
  236. {now try and see if the string matches (empty strings don't)}
  237. Result := False;
  238. MatchPos := 0;
  239. if (S <> '') then
  240. {if the regex specified a start anchor then we
  241. need to check the string starting at the first position}
  242. if FAnchorStart then begin
  243. if rcMatchSubString(S, 1, Offset) then
  244. begin
  245. MatchPos:=1;
  246. Result := True;
  247. end
  248. {If the first position did not match ang MultiLine is false, the string
  249. doesn't match. If MultiLine is true, start at every position after a
  250. newline }
  251. else if FMultiLine then begin
  252. for i := Offset to length(S)-1 do
  253. if S[i] in newline then
  254. if rcMatchSubString(S, i+1, Offset) then begin
  255. MatchPos := i+1;
  256. Result := True;
  257. Break;
  258. end;
  259. end
  260. end
  261. {otherwise we try and match the string at every position and
  262. return at the first success}
  263. else begin
  264. for i := Offset to length(S) do
  265. if rcMatchSubString(S, i, Offset) then begin
  266. MatchPos:=i;
  267. Result := True;
  268. Break;
  269. end;
  270. end;
  271. end;
  272. end; {case}
  273. end;
  274. function TRegexEngine.ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  275. type TReplRec = record
  276. Pos : integer;
  277. Len : integer;
  278. end;
  279. var ofs : Integer;
  280. size_newstr,
  281. size, pos : Integer;
  282. ReplArr : array of TReplRec;
  283. racount : integer;
  284. MatchPos : integer;
  285. DestSize : integer;
  286. LastPos : integer;
  287. MoveLen : integer;
  288. i : integer;
  289. begin
  290. setlength(ReplArr,64);
  291. racount := 0;
  292. DestSize:=length(src);
  293. size_newstr := length(newstr);
  294. Ofs := 1;
  295. while MatchString(src,MatchPos,Ofs) do
  296. begin
  297. if racount = length(ReplArr) then
  298. setlength(ReplArr,racount+racount div 2);
  299. ReplArr[racount].Pos := MatchPos;
  300. ReplArr[racount].Len := ofs;
  301. DestSize:=DestSize-ofs+MatchPos+size_newstr;
  302. inc(racount);
  303. end;
  304. SetLength(DestStr, SizeOf(Char)*DestSize);
  305. MatchPos:=1; LastPos:=1;
  306. if size_newstr<>0 then for i := 0 to racount -1 do
  307. begin
  308. MoveLen := ReplArr[i].Pos-LastPos;
  309. move(src[LastPos],DestStr[MatchPos],MoveLen);
  310. MatchPos:=MatchPos+MoveLen;
  311. LastPos := ReplArr[i].Len;
  312. move(newstr[1],DestStr[MatchPos],size_newstr);
  313. Matchpos := MatchPos+size_newstr;
  314. end
  315. else for i := 0 to racount -1 do
  316. begin
  317. MoveLen := ReplArr[i].Pos-LastPos;
  318. move(src[LastPos],DestStr[MatchPos],MoveLen);
  319. MatchPos:=MatchPos+MoveLen;
  320. LastPos := ReplArr[i].Len;
  321. end;
  322. move(src[LastPos],DestStr[MatchPos],length(src)-LastPos+1);
  323. Result := racount;
  324. end;
  325. {--------}
  326. function TRegexEngine.Parse(var aErrorPos : integer;
  327. var aErrorCode: TRegexError)
  328. : boolean;
  329. begin
  330. {clear the current transition table}
  331. rcClear;
  332. {empty regex strings are not allowed}
  333. if (FRegexStr = '') then begin
  334. Result := false;
  335. aErrorPos := 1;
  336. aErrorCode := recSuddenEnd;
  337. Exit;
  338. end;
  339. {parse the regex string}
  340. if not IgnoreCase then
  341. begin
  342. if length(FRegexStr)=1 then
  343. FRegexType:=rtSingleChar
  344. else
  345. FRegexType:=rtChars
  346. end
  347. else
  348. FRegexType:=rtRegEx;
  349. FPosn := PAnsiChar(FRegexStr);
  350. FStartState := rcParseAnchorExpr;
  351. {if an error occurred or we're not at the end of the regex string,
  352. clear the transition table, return false and the error position}
  353. if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
  354. if (FStartState <> ErrorState) and (FPosn^ <> #0) then
  355. FErrorCode := recExtraChars;
  356. rcClear;
  357. Result := false;
  358. aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
  359. aErrorCode := FErrorCode;
  360. end
  361. {otherwise add a terminal state, optimize, return true}
  362. else begin
  363. rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
  364. rcLevel1Optimize;
  365. if FAnchorStart or FAnchorEnd then FRegexType:= rtRegEx;
  366. Result := true;
  367. aErrorPos := 0;
  368. aErrorCode := recNone;
  369. end;
  370. end;
  371. {--------}
  372. function TRegexEngine.rcAddState(aMatchType : TNFAMatchType;
  373. aChar : AnsiChar;
  374. aCharClass : PCharSet;
  375. aNextState1: integer;
  376. aNextState2: integer) : integer;
  377. begin
  378. {set up the fields in the state record}
  379. with FStateTable[FStateCount] do
  380. begin
  381. if (aNextState1 = NewFinalState) then
  382. sdNextState1 := FStateCount+1
  383. else
  384. sdNextState1 := aNextState1;
  385. sdNextState2 := aNextState2;
  386. sdMatchType := aMatchType;
  387. if (aMatchType = mtChar) then
  388. sdChar := aChar
  389. else if (aMatchType = mtClass) or (aMatchType = mtNegClass) then
  390. sdClass := aCharClass;
  391. end;
  392. Result := FStateCount;
  393. inc(FStateCount);
  394. if FStateCount=length(FStateTable) then
  395. setlength(FStateTable,(FStateCount * 3) div 2);
  396. if not (aMatchType in [mtChar,mtTerminal]) then FRegexType := rtRegEx;
  397. end;
  398. {--------}
  399. procedure TRegexEngine.rcClear;
  400. var
  401. i : integer;
  402. begin
  403. {free all items in the state transition table}
  404. for i := 0 to FStateCount-1 do begin
  405. with FStateTable[i] do begin
  406. if (sdMatchType = mtClass) or
  407. (sdMatchType = mtNegClass) then
  408. if (sdClass <> nil) then
  409. FreeMem(sdClass, sizeof(TCharSet));
  410. end;
  411. end;
  412. {clear the state transition table}
  413. FStateCount:=0;
  414. FAnchorStart := false;
  415. FAnchorEnd := false;
  416. end;
  417. {--------}
  418. procedure TRegexEngine.rcError(aIndex : integer);
  419. begin
  420. raise Exception.Create(Format(eRegexParseError,[aIndex]));
  421. end;
  422. {--------}
  423. procedure TRegexEngine.rcLevel1Optimize;
  424. var
  425. i : integer;
  426. Walker : integer;
  427. begin
  428. {level 1 optimization removes all states that have only a single
  429. no-cost move to another state}
  430. {cycle through all the state records, except for the last one}
  431. for i := 0 to FStateCount - 2 do begin
  432. {get this state}
  433. with FStateTable[i] do begin
  434. {walk the chain pointed to by the first next state, unlinking
  435. the states that are simple single no-cost moves}
  436. Walker := sdNextState1;
  437. while (FStateTable[walker].sdMatchType = mtNone) and
  438. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  439. sdNextState1 := FStateTable[walker].sdNextState1;
  440. Walker := sdNextState1;
  441. end;
  442. {walk the chain pointed to by the second next state, unlinking
  443. the states that are simple single no-cost moves}
  444. if (sdNextState2 <> UnusedState) then begin
  445. Walker := sdNextState2;
  446. while (FStateTable[walker].sdMatchType = mtNone) and
  447. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  448. sdNextState2 := FStateTable[walker].sdNextState1;
  449. Walker := sdNextState2;
  450. end;
  451. end;
  452. end;
  453. end;
  454. {cycle through all the state records, except for the last one,
  455. marking unused ones--not strictly necessary but good for debugging}
  456. for i := 0 to FStateCount - 2 do begin
  457. with FStateTable[i] do begin
  458. if (sdMatchType = mtNone) and
  459. (sdNextState2 = UnusedState) then
  460. sdMatchType := mtUnused;
  461. end;
  462. end;
  463. end;
  464. {--------}
  465. function TRegexEngine.rcMatchSubString(const s : string;
  466. StartPosn : integer;
  467. var Len : integer)
  468. : boolean;
  469. var
  470. Ch : AnsiChar;
  471. State : integer;
  472. StrInx : integer;
  473. begin
  474. {assume we fail to match}
  475. Result := false;
  476. Len := StartPosn;
  477. {clear the deque}
  478. FHead := FCapacity div 2;
  479. FTail := FHead;
  480. {enqueue the special value to start scanning}
  481. DequeEnqueue(MustScan);
  482. {enqueue the first state}
  483. DequeEnqueue(FStartState);
  484. {prepare the string index}
  485. StrInx := StartPosn;
  486. {loop until the deque is empty or we run out of string}
  487. repeat
  488. {pop the top state from the deque}
  489. State := DequePop;
  490. {process the "must scan" state first}
  491. if (State = MustScan) then begin
  492. {if the deque is empty at this point, we might as well give up
  493. since there are no states left to process new characters}
  494. if (FHead <> FTail) then begin
  495. {if we haven't run out of string, get the character, and
  496. enqueue the "must scan" state again}
  497. if IgnoreCase then
  498. Ch := Upcase(s[StrInx])
  499. else
  500. Ch := s[StrInx];
  501. DequeEnqueue(MustScan);
  502. inc(StrInx);
  503. end;
  504. end
  505. {otherwise, process the state}
  506. else with FStateTable[State] do begin
  507. case sdMatchType of
  508. mtChar :
  509. begin
  510. {for a match of a character, enqueue the next state}
  511. if (Ch = sdChar) then
  512. DequeEnqueue(sdNextState1);
  513. end;
  514. mtAnyChar :
  515. begin
  516. {for a match of any character, enqueue the next state}
  517. if not (Ch in newline) then
  518. DequeEnqueue(sdNextState1);
  519. end;
  520. mtClass :
  521. begin
  522. {for a match within a class, enqueue the next state}
  523. if (Ch in sdClass^) then
  524. DequeEnqueue(sdNextState1);
  525. end;
  526. mtNegClass :
  527. begin
  528. {for a match not within a class, enqueue the next state}
  529. if not (Ch in sdClass^) then
  530. DequeEnqueue(sdNextState1);
  531. end;
  532. mtTerminal :
  533. begin
  534. {for a terminal state, the string successfully matched
  535. if the regex had no end anchor, or we're at the end
  536. of the string or line}
  537. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  538. Result := true;
  539. Len := StrInx-1;
  540. // Exit;
  541. end;
  542. end;
  543. mtNone :
  544. begin
  545. {for free moves, push the next states onto the deque}
  546. Assert(sdNextState2 <> UnusedState,
  547. 'optimization should remove all states with one no-cost move');
  548. DequePush(sdNextState2);
  549. DequePush(sdNextState1);
  550. end;
  551. mtUnused :
  552. begin
  553. Assert(false, 'unused states shouldn''t be seen');
  554. end;
  555. end;
  556. end;
  557. until (FHead = FTail) or (ch = #0); // deque empty or end of string
  558. {if we reach this point we've either exhausted the deque or we've
  559. run out of string; if the former, the substring did not match
  560. since there are no more states. If the latter, we need to check
  561. the states left on the deque to see if one is the terminating
  562. state; if so the string matched the regular expression defined by
  563. the transition table}
  564. while (FHead <> FTail) do begin
  565. State := DequePop;
  566. with FStateTable[State] do begin
  567. case sdMatchType of
  568. mtNone :
  569. begin
  570. {for free moves, push the next states onto the deque}
  571. Assert(sdNextState2 <> UnusedState,
  572. 'optimization should remove all states with one no-cost move');
  573. DequePush(sdNextState2);
  574. DequePush(sdNextState1);
  575. end;
  576. mtTerminal :
  577. begin
  578. {for a terminal state, the string successfully matched
  579. if the regex had no end anchor, or we're at the end
  580. of the string or line}
  581. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  582. Result := true;
  583. Len := StrInx -1;
  584. Exit;
  585. end;
  586. end;
  587. end;{case}
  588. end;
  589. end;
  590. end;
  591. {--------}
  592. function TRegexEngine.rcParseAnchorExpr : integer;
  593. begin
  594. {check for an initial '^'}
  595. if (FPosn^ = '^') then begin
  596. FAnchorStart := true;
  597. inc(FPosn);
  598. end;
  599. {parse an expression}
  600. Result := rcParseExpr;
  601. {if we were successful, check for the final '$'}
  602. if (Result <> ErrorState) then begin
  603. if (FPosn^ = '$') then begin
  604. FAnchorEnd := true;
  605. inc(FPosn);
  606. end;
  607. end;
  608. end;
  609. {--------}
  610. function TRegexEngine.rcParseAtom : integer;
  611. var
  612. MatchType : TNFAMatchType;
  613. CharClass : PCharSet;
  614. begin
  615. case FPosn^ of
  616. '(' :
  617. begin
  618. {move past the open parenthesis}
  619. inc(FPosn);
  620. {parse a complete regex between the parentheses}
  621. Result := rcParseExpr;
  622. if (Result = ErrorState) then
  623. Exit;
  624. {if the current character is not a close parenthesis,
  625. there's an error}
  626. if (FPosn^ <> ')') then begin
  627. FErrorCode := recNoCloseParen;
  628. Result := ErrorState;
  629. Exit;
  630. end;
  631. {move past the close parenthesis}
  632. inc(FPosn);
  633. end;
  634. '[' :
  635. begin
  636. {move past the open square bracket}
  637. inc(FPosn);
  638. {if the first character in the class is a '^' then the
  639. class if negated, otherwise it's a normal one}
  640. if (FPosn^ = '^') then begin
  641. inc(FPosn);
  642. MatchType := mtNegClass;
  643. end
  644. else begin
  645. MatchType := mtClass;
  646. end;
  647. {allocate the class character set and parse the character
  648. class; this will return either with an error, or when the
  649. closing square bracket is encountered}
  650. New(CharClass);
  651. CharClass^ := [];
  652. if not rcParseCharClass(CharClass) then begin
  653. Dispose(CharClass);
  654. Result := ErrorState;
  655. Exit;
  656. end;
  657. {move past the closing square bracket}
  658. Assert(FPosn^ = ']',
  659. 'the rcParseCharClass terminated without finding a "]"');
  660. inc(FPosn);
  661. {add a new state for the character class}
  662. Result := rcAddState(MatchType, #0, CharClass,
  663. NewFinalState, UnusedState);
  664. end;
  665. '.' :
  666. begin
  667. {move past the period metacharacter}
  668. inc(FPosn);
  669. {add a new state for the 'any character' token}
  670. Result := rcAddState(mtAnyChar, #0, nil,
  671. NewFinalState, UnusedState);
  672. end;
  673. else
  674. {otherwise parse a single character}
  675. Result := rcParseChar;
  676. end;{case}
  677. end;
  678. {--------}
  679. function TRegexEngine.rcParseCCChar(out EscapeChar : Boolean) : AnsiChar;
  680. begin
  681. EscapeChar:=False;
  682. {if we hit the end of the string, it's an error}
  683. if (FPosn^ = #0) then begin
  684. FErrorCode := recSuddenEnd;
  685. Result := #0;
  686. Exit;
  687. end;
  688. {if the current char is a metacharacter (at least in terms of a
  689. character class), it's an error}
  690. if FPosn^ in [']', '-'] then begin
  691. FErrorCode := recMetaChar;
  692. Result := #0;
  693. Exit;
  694. end;
  695. {otherwise return the character and advance past it}
  696. if (FPosn^ = '\') then
  697. {..it's an escaped character: get the next character instead}
  698. begin
  699. inc(FPosn);
  700. EscapeChar:=True;
  701. Result := rcReturnEscapeChar;
  702. end
  703. else
  704. Result := FPosn^;
  705. inc(FPosn);
  706. end;
  707. {--------}
  708. function TRegexEngine.rcParseChar : integer;
  709. var
  710. Ch : AnsiChar;
  711. begin
  712. {if we hit the end of the string, it's an error}
  713. if (FPosn^ = #0) then begin
  714. Result := ErrorState;
  715. FErrorCode := recSuddenEnd;
  716. Exit;
  717. end;
  718. {if the current char is one of the metacharacters, it's an error}
  719. if FPosn^ in MetaCharacters then begin
  720. Result := ErrorState;
  721. FErrorCode := recMetaChar;
  722. Exit;
  723. end;
  724. {otherwise add a state for the character}
  725. {..if it's an escaped character: get the next character instead}
  726. if (FPosn^ = '\') then
  727. begin
  728. inc(FPosn);
  729. ch := rcReturnEscapeChar;
  730. end
  731. else
  732. ch :=FPosn^;
  733. if IgnoreCase then
  734. Ch := Upcase(ch);
  735. Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
  736. inc(FPosn);
  737. end;
  738. {--------}
  739. function TRegexEngine.rcParseCharClass(aClass : PCharSet) : boolean;
  740. begin
  741. {assume we can't parse a character class properly}
  742. Result := false;
  743. {parse a character range; if we can't there was an error and the
  744. caller will take care of it}
  745. if not rcParseCharRange(aClass) then
  746. Exit;
  747. {if the current character was not the right bracket, parse another
  748. character class (note: we're removing the tail recursion here)}
  749. while (FPosn^ <> ']') do begin
  750. if not rcParseCharRange(aClass) then
  751. Exit;
  752. end;
  753. {if we reach here we were successful}
  754. Result := true;
  755. end;
  756. {--------}
  757. function TRegexEngine.rcParseCharRange(aClass : PCharSet) : boolean;
  758. var
  759. StartChar : AnsiChar;
  760. EndChar : AnsiChar;
  761. Ch : AnsiChar;
  762. EscChar : Boolean;
  763. begin
  764. {assume we can't parse a character range properly}
  765. Result := false;
  766. {parse a single character; if it's null there was an error}
  767. StartChar := rcParseCCChar(EscChar);
  768. if (StartChar = #0) then
  769. Exit;
  770. if EscChar then
  771. begin
  772. case StartChar of
  773. 'd' : aClass^ := aClass^ + cs_digits;
  774. 'D' : aClass^ := aClass^ + cs_allchars-cs_digits;
  775. 's' : aClass^ := aClass^ + cs_whitespace;
  776. 'S' : aClass^ := aClass^ + cs_allchars-cs_whitespace;
  777. 'w' : aClass^ := aClass^ + cs_wordchars;
  778. 'W' : aClass^ := aClass^ + cs_allchars-cs_wordchars
  779. else
  780. EscChar := False;
  781. end;
  782. if EscChar then
  783. begin
  784. Result := True;
  785. Exit;
  786. end;
  787. end;
  788. {if the current character is not a dash, the range consisted of a
  789. single character}
  790. if (FPosn^ <> '-') then begin
  791. if IgnoreCase then
  792. Include(aClass^, Upcase(StartChar))
  793. else
  794. Include(aClass^, StartChar)
  795. end
  796. {otherwise it's a real range, so get the character at the end of the
  797. range; if that's null, there was an error}
  798. else begin
  799. inc(FPosn); {move past the '-'}
  800. EndChar := rcParseCCChar(EscChar);
  801. if (EndChar = #0) then
  802. Exit;
  803. {build the range as a character set}
  804. if (StartChar > EndChar) then begin
  805. Ch := StartChar;
  806. StartChar := EndChar;
  807. EndChar := Ch;
  808. end;
  809. for Ch := StartChar to EndChar do begin
  810. Include(aClass^, Ch);
  811. if IgnoreCase then
  812. Include(aClass^, Upcase(Ch));
  813. end;
  814. end;
  815. {if we reach here we were successful}
  816. Result := true;
  817. end;
  818. {--------}
  819. function TRegexEngine.rcParseExpr : integer;
  820. var
  821. StartState1 : integer;
  822. StartState2 : integer;
  823. EndState1 : integer;
  824. OverallStartState : integer;
  825. begin
  826. {assume the worst}
  827. Result := ErrorState;
  828. {parse an initial term}
  829. StartState1 := rcParseTerm;
  830. if (StartState1 = ErrorState) then
  831. Exit;
  832. {if the current character is *not* a pipe character, no alternation
  833. is present so return the start state of the initial term as our
  834. start state}
  835. if (FPosn^ <> '|') then
  836. Result := StartState1
  837. {otherwise, we need to parse another expr and join the two together
  838. in the transition table}
  839. else begin
  840. {advance past the pipe}
  841. inc(FPosn);
  842. {the initial term's end state does not exist yet (although there
  843. is a state in the term that points to it), so create it}
  844. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  845. {for the OR construction we need a new initial state: it will
  846. point to the initial term and the second just-about-to-be-parsed
  847. expr}
  848. OverallStartState := rcAddState(mtNone, #0, nil,
  849. UnusedState, UnusedState);
  850. {parse another expr}
  851. StartState2 := rcParseExpr;
  852. if (StartState2 = ErrorState) then
  853. Exit;
  854. {alter the state state for the overall expr so that the second
  855. link points to the start of the second expr}
  856. Result := rcSetState(OverallStartState, StartState1, StartState2);
  857. {now set the end state for the initial term to point to the final
  858. end state for the second expr and the overall expr}
  859. rcSetState(EndState1, FStateCount, UnusedState);
  860. end;
  861. end;
  862. {--------}
  863. function TRegexEngine.rcParseFactor : integer;
  864. var
  865. StartStateAtom : integer;
  866. EndStateAtom : integer;
  867. TempEndStateAtom : integer;
  868. Int : string;
  869. n,m,nState : integer;
  870. i : integer;
  871. begin
  872. {assume the worst}
  873. Result := ErrorState;
  874. {first parse an atom}
  875. StartStateAtom := rcParseAtom;
  876. if (StartStateAtom = ErrorState) then
  877. Exit;
  878. {check for a closure operator}
  879. case FPosn^ of
  880. '?' : begin
  881. {move past the ? operator}
  882. inc(FPosn);
  883. {the atom's end state doesn't exist yet, so create one}
  884. EndStateAtom := rcAddState(mtNone, #0, nil,
  885. UnusedState, UnusedState);
  886. {create a new start state for the overall regex}
  887. Result := rcAddState(mtNone, #0, nil,
  888. StartStateAtom, EndStateAtom);
  889. {make sure the new end state points to the next unused
  890. state}
  891. rcSetState(EndStateAtom, FStateCount, UnusedState);
  892. end;
  893. '*' : begin
  894. {move past the * operator}
  895. inc(FPosn);
  896. {the atom's end state doesn't exist yet, so create one;
  897. it'll be the start of the overall regex subexpression}
  898. Result := rcAddState(mtNone, #0, nil,
  899. NewFinalState, StartStateAtom);
  900. end;
  901. '+' : begin
  902. {move past the + operator}
  903. inc(FPosn);
  904. {the atom's end state doesn't exist yet, so create one}
  905. rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
  906. {the start of the overall regex subexpression will be the
  907. atom's start state}
  908. Result := StartStateAtom;
  909. end;
  910. '{' : begin // {n,m}
  911. {move past the brace }
  912. inc(FPosn);
  913. {Parse the value of n}
  914. Int := '';
  915. while not (FPosn^ in [',','}',#0]) do
  916. begin
  917. int := int+FPosn^;
  918. inc(FPosn);
  919. end;
  920. if FPosn^ = #0 then exit; // No end-brace or comma -> invalid regex
  921. if int <> '' then
  922. n := StrToIntDef(Int,-2)
  923. else
  924. n := -1; // if n is 'empty', set it to -1
  925. if n = -2 then exit; // Invalid value for n -> invalid RegEx
  926. if FPosn^ <> '}' then
  927. begin
  928. {move past the , }
  929. inc(FPosn);
  930. {Parse the value of m}
  931. Int := '';
  932. while not (FPosn^ in ['}',#0]) do
  933. begin
  934. int := int+FPosn^;
  935. inc(FPosn);
  936. end;
  937. if FPosn^ <> '}' then exit; // No end-brace -> invalid regex
  938. if int <> '' then m := StrToIntDef(Int,-2)
  939. else m := -1;
  940. if m = -2 then exit; // Invalid RegEx
  941. end
  942. else
  943. m := -3;
  944. {move past the brace }
  945. inc(FPosn);
  946. if (n=0) and (m=-1) then
  947. {the atom's end state doesn't exist yet, so create one;
  948. it'll be the start of the overall regex subexpression}
  949. Result := rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom)
  950. else
  951. begin
  952. EndStateAtom := FStateCount-1;
  953. TempEndStateAtom:=StartStateAtom;
  954. for i := 1 to n-1 do
  955. begin
  956. TempEndStateAtom:=FStateCount;
  957. for nState:=StartStateAtom to EndStateAtom do
  958. begin
  959. FStateTable[FStateCount]:=FStateTable[nState];
  960. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  961. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) *i;
  962. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  963. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) *i;
  964. inc(FStateCount);
  965. if FStateCount=length(FStateTable) then
  966. setlength(FStateTable,(FStateCount * 3) div 2);
  967. end;
  968. end;
  969. for i := n to m-1 do
  970. begin
  971. rcAddState(mtNone, #0, nil, NewFinalState, EndStateAtom+(EndStateAtom-StartStateAtom+1) * (m-1) + (m-n)+1);
  972. TempEndStateAtom:=FStateCount;
  973. for nState:=StartStateAtom to EndStateAtom do
  974. begin
  975. FStateTable[FStateCount]:=FStateTable[nState];
  976. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  977. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) * i+(i-n+1);
  978. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  979. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) * i+(i-n+1);
  980. inc(FStateCount);
  981. if FStateCount=length(FStateTable) then
  982. setlength(FStateTable,(FStateCount * 3) div 2);
  983. end;
  984. end;
  985. if m = -1 then
  986. rcAddState(mtNone, #0, nil, NewFinalState, TempEndStateAtom);
  987. Result := StartStateAtom;
  988. end;
  989. end;
  990. else
  991. Result := StartStateAtom;
  992. end;{case}
  993. end;
  994. {--------}
  995. function TRegexEngine.rcParseTerm : integer;
  996. var
  997. StartState2 : integer;
  998. EndState1 : integer;
  999. begin
  1000. {parse an initial factor, the state number returned will also be our
  1001. return state number}
  1002. Result := rcParseFactor;
  1003. if (Result = ErrorState) then
  1004. Exit;
  1005. {Note: we have to "break the grammar" here. We've parsed a regular
  1006. subexpression and we're possibly following on with another
  1007. regular subexpression. There's no nice operator to key off
  1008. for concatenation: we just have to know that for
  1009. concatenating two subexpressions, the current character will
  1010. be
  1011. - an open parenthesis
  1012. - an open square bracket
  1013. - an any char operator
  1014. - a character that's not a metacharacter
  1015. i.e., the three possibilities for the start of an "atom" in
  1016. our grammar}
  1017. if (FPosn^ = '(') or
  1018. (FPosn^ = '[') or
  1019. (FPosn^ = '.') or
  1020. ((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
  1021. {the initial factor's end state does not exist yet (although there
  1022. is a state in the term that points to it), so create it}
  1023. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  1024. {parse another term}
  1025. StartState2 := rcParseTerm;
  1026. if (StartState2 = ErrorState) then begin
  1027. Result := ErrorState;
  1028. Exit;
  1029. end;
  1030. {join the first factor to the second term}
  1031. rcSetState(EndState1, StartState2, UnusedState);
  1032. end;
  1033. end;
  1034. procedure TRegexEngine.WriteTable;
  1035. var i : integer;
  1036. begin
  1037. for i := 0 to FStateCount-1 do with FStateTable[i] do
  1038. // FPC version 2.2.1 can not write enum-types. Disabled in fixes_2_2 only
  1039. // writeln('s:',i,' mt:',sdMatchType ,' ns1:',sdNextState1,' ns2:',sdNextState2,' char:',sdChar);
  1040. writeln('s:',i,' ns1:',sdNextState1,' ns2:',sdNextState2,' char:',sdChar);
  1041. end;
  1042. procedure TRegexEngine.DequeEnqueue(aValue: integer);
  1043. begin
  1044. FList[FTail] := aValue;
  1045. inc(FTail);
  1046. if (FTail = FCapacity) then
  1047. FTail := 0
  1048. else if (FTail = FHead) then
  1049. DequeGrow;
  1050. end;
  1051. procedure TRegexEngine.DequePush(aValue: integer);
  1052. begin
  1053. if (FHead = 0) then
  1054. FHead := FCapacity;
  1055. dec(FHead);
  1056. FList[FHead] := aValue;
  1057. if (FTail = FHead) then
  1058. DequeGrow;
  1059. end;
  1060. function TRegexEngine.DequePop: integer;
  1061. begin
  1062. Result := FList[FHead];
  1063. inc(FHead);
  1064. if (FHead = FCapacity) then
  1065. FHead := 0;
  1066. end;
  1067. procedure TRegexEngine.DequeGrow;
  1068. var
  1069. OldCount : integer;
  1070. i, j : integer;
  1071. begin
  1072. {grow the list by 50%}
  1073. OldCount := FCapacity;
  1074. FCapacity:=(OldCount * 3) div 2;
  1075. SetLength(FList,FCapacity);
  1076. {expand the data into the increased space, maintaining the deque}
  1077. if (FHead = 0) then
  1078. FTail := OldCount
  1079. else begin
  1080. j := FCapacity;
  1081. for i := pred(OldCount) downto FHead do begin
  1082. dec(j);
  1083. FList[j] := FList[i]
  1084. end;
  1085. FHead := j;
  1086. end;
  1087. end;
  1088. function TRegexEngine.rcReturnEscapeChar: AnsiChar;
  1089. begin
  1090. case FPosn^ of
  1091. 't' : Result := #9;
  1092. 'n' : Result := #10;
  1093. 'r' : Result := #13;
  1094. 'f' : Result := #12;
  1095. 'a' : Result := #7;
  1096. else
  1097. Result := FPosn^;
  1098. end;
  1099. end;
  1100. {--------}
  1101. procedure TRegexEngine.rcSetIgnoreCase(aValue : boolean);
  1102. begin
  1103. if (aValue <> FIgnoreCase) then begin
  1104. rcClear;
  1105. FIgnoreCase := aValue;
  1106. end;
  1107. end;
  1108. {--------}
  1109. procedure TRegexEngine.rcSetRegexStr(const aRegexStr : string);
  1110. begin
  1111. if (aRegexStr <> FRegexStr) then begin
  1112. rcClear;
  1113. FRegexStr := aRegexStr;
  1114. end;
  1115. end;
  1116. {--------}
  1117. function TRegexEngine.rcSetState(aState : integer;
  1118. aNextState1: integer;
  1119. aNextState2: integer) : integer;
  1120. begin
  1121. Assert((0 <= aState) and (aState < FStateCount),
  1122. 'trying to change an invalid state');
  1123. {get the state record and change the transition information}
  1124. FStateTable[aState].sdNextState1 := aNextState1;
  1125. FStateTable[aState].sdNextState2 := aNextState2;
  1126. Result := aState;
  1127. end;
  1128. {--------}
  1129. procedure TRegexEngine.rcSetUpcase(aValue : TUpcaseFunc);
  1130. begin
  1131. if not Assigned(aValue) then
  1132. FUpcase := SystemUpcase
  1133. else
  1134. FUpcase := aValue;
  1135. end;
  1136. procedure TRegexEngine.rcSetMultiLine(aValue: Boolean);
  1137. begin
  1138. FMultiLine:=aValue;
  1139. end;
  1140. {====================================================================}
  1141. end.