regex.pp 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271
  1. {
  2. This file is part of the Free Pascal packages library.
  3. Copyright (c) 2008 by Joost van der Sluis, member of the
  4. Free Pascal development team
  5. Regexpression parser
  6. This code is based on the examples in the book
  7. 'Tomes of Delphi: Algorithms and Data Structures' by Julian M Bucknall
  8. The code is used with his permission. For an excellent explanation of
  9. this unit, see the book...
  10. See the file COPYING.FPC, included in this distribution,
  11. for details about the copyright.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. **********************************************************************}
  16. unit Regex;
  17. {$mode Delphi}{$H+}
  18. {$INLINE ON}
  19. interface
  20. {Notes:
  21. these classes parse regular expressions that follow this grammar:
  22. <anchorexpr> ::= <expr> |
  23. '^' <expr> |
  24. <expr> '$' |
  25. '^' <expr> '$'
  26. <expr> ::= <term> |
  27. <term> '|' <expr> - alternation
  28. <term> ::= <factor> |
  29. <factor><term> - concatenation
  30. <factor> ::= <atom> |
  31. <atom> '?' | - zero or one
  32. <atom> '*' | - zero or more
  33. <atom> 'n,m' | - min n, max m (added by Joost)
  34. <atom> '+' - one or more
  35. <atom> ::= <char> |
  36. '.' | - any char
  37. '(' <expr> ') | - parentheses
  38. '[' <charclass> ']' | - normal class
  39. '[^' <charclass> ']' - negated class
  40. <charclass> ::= <charrange> |
  41. <charrange><charclass>
  42. <charrange> ::= <ccchar> |
  43. <ccchar> '-' <ccchar>
  44. <char> ::= <any character except metacharacters> |
  45. '\' <any character at all>
  46. <ccchar> ::= <any character except '-' and ']'> |
  47. '\' <any character at all>
  48. This means that parentheses have maximum precedence, followed
  49. by square brackets, followed by the closure operators,
  50. followed by concatenation, finally followed by alternation.
  51. }
  52. uses
  53. SysUtils,
  54. Classes;
  55. type
  56. TUpcaseFunc = function(aCh : AnsiChar) : AnsiChar;
  57. TNFAMatchType = ( {types of matching performed...}
  58. mtNone, {..no match (an epsilon no-cost move)}
  59. mtAnyChar, {..any character}
  60. mtChar, {..a particular character}
  61. mtClass, {..a character class}
  62. mtDupClass, {..a character class beying referenced}
  63. mtNegClass, {..a negated character class}
  64. mtTerminal, {..the final state--no matching}
  65. mtUnused); {..an unused state--no matching}
  66. TRegexError = ( {error codes for invalid regex strings}
  67. recNone, {..no error}
  68. recSuddenEnd, {..unexpected end of string}
  69. recMetaChar, {..read metacharacter, but needed normal char}
  70. recNoCloseParen, {..expected close paren, but not there}
  71. recExtraChars {..not at end of string after parsing regex}
  72. );
  73. TRegexType = (
  74. rtRegEx,
  75. rtChars,
  76. rtSingleChar
  77. );
  78. PCharSet = ^TCharSet;
  79. TCharSet = set of Char;
  80. { TtdRegexEngine }
  81. TNFAState = record
  82. sdNextState1: integer;
  83. sdNextState2: integer;
  84. sdClass : PCharSet;
  85. sdMatchType : TNFAMatchType;
  86. sdChar : AnsiChar;
  87. end;
  88. { TRegexEngine }
  89. TRegexEngine = class
  90. private
  91. FAnchorEnd : boolean;
  92. FAnchorStart: boolean;
  93. FErrorCode : TRegexError;
  94. FIgnoreCase : boolean;
  95. FMultiLine : boolean;
  96. FPosn : PAnsiChar;
  97. FRegexStr : string;
  98. FStartState : integer;
  99. FStateTable : Array of TNFAState;
  100. FStateCount : integer;
  101. FUpcase : TUpcaseFunc;
  102. // The deque (double-ended queue)
  103. FList : array of integer;
  104. FCapacity : integer;
  105. FHead : integer;
  106. FTail : integer;
  107. FRegexType : TRegexType;
  108. protected
  109. procedure DequeEnqueue(aValue : integer);
  110. procedure DequePush(aValue : integer);
  111. function DequePop : integer;
  112. procedure DequeGrow;
  113. procedure rcSetIgnoreCase(aValue : boolean); virtual;
  114. procedure rcSetRegexStr(const aRegexStr : string); virtual;
  115. procedure rcSetUpcase(aValue : TUpcaseFunc); virtual;
  116. procedure rcSetMultiLine(aValue : Boolean); virtual;
  117. procedure rcClear; virtual;
  118. procedure rcError(aIndex : integer); virtual;
  119. procedure rcLevel1Optimize; virtual;
  120. function rcMatchSubString(const S : string;
  121. StartPosn : integer;
  122. var Len : integer) : boolean; virtual;
  123. function rcAddState(aMatchType : TNFAMatchType;
  124. aChar : AnsiChar;
  125. aCharClass : PCharSet;
  126. aNextState1: integer;
  127. aNextState2: integer) : integer;
  128. function rcSetState(aState : integer;
  129. aNextState1: integer;
  130. aNextState2: integer) : integer;
  131. function rcParseAnchorExpr : integer; virtual;
  132. function rcParseAtom : integer; virtual;
  133. function rcParseCCChar(out EscapeChar : Boolean) : AnsiChar; virtual;
  134. function rcParseChar : integer; virtual;
  135. function rcParseCharClass(aClass : PCharSet) : boolean; virtual;
  136. function rcParseCharRange(aClass : PCharSet) : boolean; virtual;
  137. function rcParseExpr : integer; virtual;
  138. function rcParseFactor : integer; virtual;
  139. function rcParseTerm : integer; virtual;
  140. Function rcReturnEscapeChar : AnsiChar; virtual;
  141. public
  142. procedure WriteTable;
  143. constructor Create(const aRegexStr : string);
  144. destructor Destroy; override;
  145. function Parse(out aErrorPos : integer;
  146. out aErrorCode: TRegexError) : boolean; virtual;
  147. function MatchString(const S : string; out MatchPos : integer; var Offset : integer) : boolean; virtual;
  148. function ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  149. property IgnoreCase : boolean
  150. read FIgnoreCase write rcSetIgnoreCase;
  151. property MultiLine : boolean
  152. read FMultiLine write rcSetMultiLine;
  153. property RegexString : string
  154. read FRegexStr write rcSetRegexStr;
  155. property Upcase : TUpcaseFunc
  156. read FUpcase write rcSetUpcase;
  157. end;
  158. Resourcestring
  159. eRegexParseError = 'Error at %d when parsing regular expression';
  160. implementation
  161. uses strutils;
  162. const
  163. MetaCharacters : set of AnsiChar =
  164. ['[', ']', '(', ')', '|', '*', '+', '?', '-', '.',
  165. '^', '$', '{', '}'];
  166. newline : TCharSet = [#10,#13,#$85];
  167. {some handy constants}
  168. UnusedState = -1;
  169. NewFinalState = -2;
  170. CreateNewState = -3;
  171. ErrorState = -4;
  172. MustScan = -5;
  173. cs_allchars : tcharset = [#0..#255];
  174. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  175. cs_newline : tcharset = [#10];
  176. cs_digits : tcharset = ['0'..'9'];
  177. cs_whitespace : tcharset = [' ',#9];
  178. {===Helper routines==================================================}
  179. function SystemUpcase(aCh : AnsiChar) : AnsiChar; far;
  180. begin
  181. Result := System.Upcase(aCh);
  182. end;
  183. {====================================================================}
  184. {===TRegexEngine===================================================}
  185. constructor TRegexEngine.Create(const aRegexStr : string);
  186. begin
  187. inherited Create;
  188. FRegexStr := aRegexStr;
  189. FIgnoreCase := false;
  190. FUpcase := SystemUpcase;
  191. SetLength(FStateTable,64);
  192. FStateCount:=0;
  193. FCapacity:=64;
  194. setlength(FList,FCapacity);
  195. {let's help out the user of the deque by putting the head and
  196. tail pointers in the middle: it's probably more efficient}
  197. FHead := FCapacity div 2;
  198. FTail := FHead;
  199. MultiLine:=False;
  200. end;
  201. {--------}
  202. destructor TRegexEngine.Destroy;
  203. begin
  204. if (FStateTable <> nil) then
  205. rcClear;
  206. inherited Destroy;
  207. end;
  208. {--------}
  209. function TRegexEngine.MatchString(const S : string; out MatchPos : integer; var Offset : integer): boolean;
  210. var
  211. i : integer;
  212. ErrorPos : integer;
  213. ErrorCode : TRegexError;
  214. pc : pchar;
  215. x:integer;
  216. begin
  217. if Offset>length(S) then
  218. begin
  219. Result := False;
  220. MatchPos := 0;
  221. Exit;
  222. end;
  223. {if the regex string hasn't been parsed yet, do so}
  224. if (FStateCount = 0) then begin
  225. if not Parse(ErrorPos, ErrorCode) then
  226. rcError(ErrorPos);
  227. end;
  228. case FRegexType of
  229. rtSingleChar :
  230. begin
  231. MatchPos := PosEx(char(FRegexStr[1]),s,Offset);
  232. Offset := MatchPos+1;
  233. Result := (MatchPos>0);
  234. end;
  235. rtChars :
  236. begin
  237. MatchPos := PosEx(FRegexStr,s,Offset);
  238. Offset := MatchPos+length(FRegexStr);
  239. Result := (MatchPos>0);
  240. end
  241. else
  242. begin
  243. {now try and see if the string matches (empty strings don't)}
  244. Result := False;
  245. MatchPos := 0;
  246. if (S <> '') then
  247. {if the regex specified a start anchor then we
  248. need to check the string starting at the first position}
  249. if FAnchorStart then begin
  250. if rcMatchSubString(S, 1, Offset) then
  251. begin
  252. MatchPos:=1;
  253. Result := True;
  254. end
  255. {If the first position did not match ang MultiLine is false, the string
  256. doesn't match. If MultiLine is true, start at every position after a
  257. newline }
  258. else if FMultiLine then begin
  259. for i := Offset to length(S)-1 do
  260. if S[i] in newline then
  261. if rcMatchSubString(S, i+1, Offset) then begin
  262. MatchPos := i+1;
  263. Result := True;
  264. Break;
  265. end;
  266. end
  267. end
  268. {otherwise we try and match the string at every position and
  269. return at the first success}
  270. else begin
  271. for i := Offset to length(S) do
  272. if rcMatchSubString(S, i, Offset) then begin
  273. MatchPos:=i;
  274. Result := True;
  275. Break;
  276. end;
  277. end;
  278. end;
  279. end; {case}
  280. end;
  281. function TRegexEngine.ReplaceAllString(const src, newstr: ansistring; out DestStr : string): Integer;
  282. type TReplRec = record
  283. Pos : integer;
  284. Len : integer;
  285. end;
  286. var ofs : Integer;
  287. size_newstr,
  288. size, pos : Integer;
  289. ReplArr : array of TReplRec;
  290. racount : integer;
  291. MatchPos : integer;
  292. DestSize : integer;
  293. LastPos : integer;
  294. MoveLen : integer;
  295. i : integer;
  296. begin
  297. setlength(ReplArr,64);
  298. racount := 0;
  299. DestSize:=length(src);
  300. size_newstr := length(newstr);
  301. Ofs := 1;
  302. while MatchString(src,MatchPos,Ofs) do
  303. begin
  304. if racount = length(ReplArr) then
  305. setlength(ReplArr,racount+racount div 2);
  306. ReplArr[racount].Pos := MatchPos;
  307. ReplArr[racount].Len := ofs;
  308. DestSize:=DestSize-ofs+MatchPos+size_newstr;
  309. inc(racount);
  310. end;
  311. SetLength(DestStr, SizeOf(Char)*DestSize);
  312. MatchPos:=1; LastPos:=1;
  313. if size_newstr<>0 then for i := 0 to racount -1 do
  314. begin
  315. MoveLen := ReplArr[i].Pos-LastPos;
  316. move(src[LastPos],DestStr[MatchPos],MoveLen);
  317. MatchPos:=MatchPos+MoveLen;
  318. LastPos := ReplArr[i].Len;
  319. move(newstr[1],DestStr[MatchPos],size_newstr);
  320. Matchpos := MatchPos+size_newstr;
  321. end
  322. else for i := 0 to racount -1 do
  323. begin
  324. MoveLen := ReplArr[i].Pos-LastPos;
  325. move(src[LastPos],DestStr[MatchPos],MoveLen);
  326. MatchPos:=MatchPos+MoveLen;
  327. LastPos := ReplArr[i].Len;
  328. end;
  329. move(src[LastPos],DestStr[MatchPos],length(src)-LastPos+1);
  330. Result := racount;
  331. end;
  332. {--------}
  333. function TRegexEngine.Parse(out aErrorPos : integer;
  334. out aErrorCode: TRegexError)
  335. : boolean;
  336. begin
  337. {clear the current transition table}
  338. rcClear;
  339. {empty regex strings are not allowed}
  340. if (FRegexStr = '') then begin
  341. Result := false;
  342. aErrorPos := 1;
  343. aErrorCode := recSuddenEnd;
  344. Exit;
  345. end;
  346. {parse the regex string}
  347. if not IgnoreCase then
  348. begin
  349. if length(FRegexStr)=1 then
  350. FRegexType:=rtSingleChar
  351. else
  352. FRegexType:=rtChars
  353. end
  354. else
  355. FRegexType:=rtRegEx;
  356. FPosn := PAnsiChar(FRegexStr);
  357. FStartState := rcParseAnchorExpr;
  358. {if an error occurred or we're not at the end of the regex string,
  359. clear the transition table, return false and the error position}
  360. if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
  361. if (FStartState <> ErrorState) and (FPosn^ <> #0) then
  362. FErrorCode := recExtraChars;
  363. rcClear;
  364. Result := false;
  365. aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
  366. aErrorCode := FErrorCode;
  367. end
  368. {otherwise add a terminal state, optimize, return true}
  369. else begin
  370. rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
  371. rcLevel1Optimize;
  372. if FAnchorStart or FAnchorEnd then FRegexType:= rtRegEx;
  373. Result := true;
  374. aErrorPos := 0;
  375. aErrorCode := recNone;
  376. end;
  377. end;
  378. {--------}
  379. function TRegexEngine.rcAddState(aMatchType : TNFAMatchType;
  380. aChar : AnsiChar;
  381. aCharClass : PCharSet;
  382. aNextState1: integer;
  383. aNextState2: integer) : integer;
  384. begin
  385. {set up the fields in the state record}
  386. with FStateTable[FStateCount] do
  387. begin
  388. if (aNextState1 = NewFinalState) then
  389. sdNextState1 := FStateCount+1
  390. else
  391. sdNextState1 := aNextState1;
  392. sdNextState2 := aNextState2;
  393. sdMatchType := aMatchType;
  394. if (aMatchType = mtChar) then
  395. sdChar := aChar
  396. else if aMatchType in [mtClass, mtDupClass, mtNegClass] then
  397. sdClass := aCharClass;
  398. end;
  399. Result := FStateCount;
  400. inc(FStateCount);
  401. if FStateCount=length(FStateTable) then
  402. setlength(FStateTable,(FStateCount * 3) div 2);
  403. if not (aMatchType in [mtChar,mtTerminal,mtNone]) then FRegexType := rtRegEx;
  404. end;
  405. {--------}
  406. procedure TRegexEngine.rcClear;
  407. var
  408. i, j : integer;
  409. begin
  410. {free all items in the state transition table}
  411. for i := 0 to FStateCount-1 do begin
  412. with FStateTable[i] do begin
  413. if (sdMatchType = mtClass) or
  414. (sdMatchType = mtNegClass) and
  415. (sdClass <> nil) then
  416. begin
  417. for j := i+1 to FStateCount-1 do
  418. if (FStateTable[j].sdClass = sdClass) then
  419. FStateTable[j].sdClass := nil;
  420. FreeMem(sdClass, sizeof(TCharSet));
  421. end;
  422. // I am not sure if the next line is necessary. rcAddState set all values, so
  423. // it shouldn't be necessary to clear its contents?
  424. // FillChar(FStateTable[i],SizeOf(FStateTable[i]),#0);
  425. end;
  426. end;
  427. {clear the state transition table}
  428. FStateCount:=0;
  429. FAnchorStart := false;
  430. FAnchorEnd := false;
  431. end;
  432. {--------}
  433. procedure TRegexEngine.rcError(aIndex : integer);
  434. begin
  435. raise Exception.Create(Format(eRegexParseError,[aIndex]));
  436. end;
  437. {--------}
  438. procedure TRegexEngine.rcLevel1Optimize;
  439. var
  440. i : integer;
  441. Walker : integer;
  442. begin
  443. {level 1 optimization removes all states that have only a single
  444. no-cost move to another state}
  445. {cycle through all the state records, except for the last one}
  446. for i := 0 to FStateCount - 2 do begin
  447. {get this state}
  448. with FStateTable[i] do begin
  449. {walk the chain pointed to by the first next state, unlinking
  450. the states that are simple single no-cost moves}
  451. Walker := sdNextState1;
  452. while (FStateTable[walker].sdMatchType = mtNone) and
  453. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  454. sdNextState1 := FStateTable[walker].sdNextState1;
  455. Walker := sdNextState1;
  456. end;
  457. {walk the chain pointed to by the second next state, unlinking
  458. the states that are simple single no-cost moves}
  459. if (sdNextState2 <> UnusedState) then begin
  460. Walker := sdNextState2;
  461. while (FStateTable[walker].sdMatchType = mtNone) and
  462. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  463. sdNextState2 := FStateTable[walker].sdNextState1;
  464. Walker := sdNextState2;
  465. end;
  466. end;
  467. end;
  468. end;
  469. {cycle through all the state records, except for the last one,
  470. marking unused ones--not strictly necessary but good for debugging}
  471. for i := 0 to FStateCount - 2 do begin
  472. with FStateTable[i] do begin
  473. if (sdMatchType = mtNone) and
  474. (sdNextState2 = UnusedState) then
  475. sdMatchType := mtUnused;
  476. end;
  477. end;
  478. end;
  479. {--------}
  480. function TRegexEngine.rcMatchSubString(const s : string;
  481. StartPosn : integer;
  482. var Len : integer)
  483. : boolean;
  484. var
  485. Ch : AnsiChar;
  486. State : integer;
  487. StrInx : integer;
  488. begin
  489. {assume we fail to match}
  490. Result := false;
  491. Len := StartPosn;
  492. {clear the deque}
  493. FHead := FCapacity div 2;
  494. FTail := FHead;
  495. {enqueue the special value to start scanning}
  496. DequeEnqueue(MustScan);
  497. {enqueue the first state}
  498. DequeEnqueue(FStartState);
  499. {prepare the string index}
  500. StrInx := StartPosn;
  501. {loop until the deque is empty or we run out of string}
  502. repeat
  503. {pop the top state from the deque}
  504. State := DequePop;
  505. {process the "must scan" state first}
  506. if (State = MustScan) then begin
  507. {if the deque is empty at this point, we might as well give up
  508. since there are no states left to process new characters}
  509. if (FHead <> FTail) then begin
  510. {if we haven't run out of string, get the character, and
  511. enqueue the "must scan" state again}
  512. if IgnoreCase then
  513. Ch := Upcase(s[StrInx])
  514. else
  515. Ch := s[StrInx];
  516. DequeEnqueue(MustScan);
  517. inc(StrInx);
  518. end;
  519. end
  520. {otherwise, process the state}
  521. else with FStateTable[State] do begin
  522. case sdMatchType of
  523. mtChar :
  524. begin
  525. {for a match of a character, enqueue the next state}
  526. if (Ch = sdChar) then
  527. DequeEnqueue(sdNextState1);
  528. end;
  529. mtAnyChar :
  530. begin
  531. {for a match of any character, enqueue the next state}
  532. if not (Ch in newline) then
  533. DequeEnqueue(sdNextState1);
  534. end;
  535. mtClass, mtDupClass :
  536. begin
  537. {for a match within a class, enqueue the next state}
  538. if (Ch in sdClass^) then
  539. DequeEnqueue(sdNextState1);
  540. end;
  541. mtNegClass :
  542. begin
  543. {for a match not within a class, enqueue the next state}
  544. if not (Ch in sdClass^) then
  545. DequeEnqueue(sdNextState1);
  546. end;
  547. mtTerminal :
  548. begin
  549. {for a terminal state, the string successfully matched
  550. if the regex had no end anchor, or we're at the end
  551. of the string or line}
  552. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  553. Result := true;
  554. Len := StrInx-1;
  555. // Exit;
  556. end;
  557. end;
  558. mtNone :
  559. begin
  560. {for free moves, push the next states onto the deque}
  561. Assert(sdNextState2 <> UnusedState,
  562. 'optimization should remove all states with one no-cost move');
  563. DequePush(sdNextState2);
  564. DequePush(sdNextState1);
  565. end;
  566. mtUnused :
  567. begin
  568. Assert(false, 'unused states shouldn''t be seen');
  569. end;
  570. end;
  571. end;
  572. until (FHead = FTail) or (ch = #0); // deque empty or end of string
  573. {if we reach this point we've either exhausted the deque or we've
  574. run out of string; if the former, the substring did not match
  575. since there are no more states. If the latter, we need to check
  576. the states left on the deque to see if one is the terminating
  577. state; if so the string matched the regular expression defined by
  578. the transition table}
  579. while (FHead <> FTail) do begin
  580. State := DequePop;
  581. with FStateTable[State] do begin
  582. case sdMatchType of
  583. mtNone :
  584. begin
  585. {for free moves, push the next states onto the deque}
  586. Assert(sdNextState2 <> UnusedState,
  587. 'optimization should remove all states with one no-cost move');
  588. DequePush(sdNextState2);
  589. DequePush(sdNextState1);
  590. end;
  591. mtTerminal :
  592. begin
  593. {for a terminal state, the string successfully matched
  594. if the regex had no end anchor, or we're at the end
  595. of the string or line}
  596. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  597. Result := true;
  598. Len := StrInx -1;
  599. Exit;
  600. end;
  601. end;
  602. end;{case}
  603. end;
  604. end;
  605. end;
  606. {--------}
  607. function TRegexEngine.rcParseAnchorExpr : integer;
  608. begin
  609. {check for an initial '^'}
  610. if (FPosn^ = '^') then begin
  611. FAnchorStart := true;
  612. inc(FPosn);
  613. end;
  614. {parse an expression}
  615. Result := rcParseExpr;
  616. {if we were successful, check for the final '$'}
  617. if (Result <> ErrorState) then begin
  618. if (FPosn^ = '$') then begin
  619. FAnchorEnd := true;
  620. inc(FPosn);
  621. end;
  622. end;
  623. end;
  624. {--------}
  625. function TRegexEngine.rcParseAtom : integer;
  626. var
  627. MatchType : TNFAMatchType;
  628. CharClass : PCharSet;
  629. begin
  630. case FPosn^ of
  631. '(' :
  632. begin
  633. {move past the open parenthesis}
  634. inc(FPosn);
  635. {parse a complete regex between the parentheses}
  636. Result := rcParseExpr;
  637. if (Result = ErrorState) then
  638. Exit;
  639. {if the current character is not a close parenthesis,
  640. there's an error}
  641. if (FPosn^ <> ')') then begin
  642. FErrorCode := recNoCloseParen;
  643. Result := ErrorState;
  644. Exit;
  645. end;
  646. {move past the close parenthesis}
  647. inc(FPosn);
  648. {always handle expressions with parentheses as regular-expression}
  649. FRegexType := rtRegEx;
  650. end;
  651. '[' :
  652. begin
  653. {move past the open square bracket}
  654. inc(FPosn);
  655. {if the first character in the class is a '^' then the
  656. class if negated, otherwise it's a normal one}
  657. if (FPosn^ = '^') then begin
  658. inc(FPosn);
  659. MatchType := mtNegClass;
  660. end
  661. else begin
  662. MatchType := mtClass;
  663. end;
  664. {allocate the class character set and parse the character
  665. class; this will return either with an error, or when the
  666. closing square bracket is encountered}
  667. New(CharClass);
  668. CharClass^ := [];
  669. if not rcParseCharClass(CharClass) then begin
  670. Dispose(CharClass);
  671. Result := ErrorState;
  672. Exit;
  673. end;
  674. {move past the closing square bracket}
  675. Assert(FPosn^ = ']',
  676. 'the rcParseCharClass terminated without finding a "]"');
  677. inc(FPosn);
  678. {add a new state for the character class}
  679. Result := rcAddState(MatchType, #0, CharClass,
  680. NewFinalState, UnusedState);
  681. end;
  682. '.' :
  683. begin
  684. {move past the period metacharacter}
  685. inc(FPosn);
  686. {add a new state for the 'any character' token}
  687. Result := rcAddState(mtAnyChar, #0, nil,
  688. NewFinalState, UnusedState);
  689. end;
  690. '\' :
  691. begin
  692. if (FPosn+1)^ in ['d','D','s','S','w','W'] then begin
  693. New(CharClass);
  694. CharClass^ := [];
  695. if not rcParseCharRange(CharClass) then begin
  696. Dispose(CharClass);
  697. Result := ErrorState;
  698. Exit;
  699. end;
  700. Result := rcAddState(mtClass, #0, CharClass,
  701. NewFinalState, UnusedState);
  702. end
  703. else
  704. Result := rcParseChar;
  705. end;
  706. else
  707. {otherwise parse a single character}
  708. Result := rcParseChar;
  709. end;{case}
  710. end;
  711. {--------}
  712. function TRegexEngine.rcParseCCChar(out EscapeChar : Boolean) : AnsiChar;
  713. begin
  714. EscapeChar:=False;
  715. {if we hit the end of the string, it's an error}
  716. if (FPosn^ = #0) then begin
  717. FErrorCode := recSuddenEnd;
  718. Result := #0;
  719. Exit;
  720. end;
  721. {if the current char is a metacharacter (at least in terms of a
  722. character class), it's an error}
  723. if FPosn^ in [']', '-'] then begin
  724. FErrorCode := recMetaChar;
  725. Result := #0;
  726. Exit;
  727. end;
  728. {otherwise return the character and advance past it}
  729. if (FPosn^ = '\') then
  730. {..it's an escaped character: get the next character instead}
  731. begin
  732. inc(FPosn);
  733. EscapeChar:=True;
  734. Result := rcReturnEscapeChar;
  735. end
  736. else
  737. Result := FPosn^;
  738. inc(FPosn);
  739. end;
  740. {--------}
  741. function TRegexEngine.rcParseChar : integer;
  742. var
  743. Ch : AnsiChar;
  744. begin
  745. {if we hit the end of the string, it's an error}
  746. if (FPosn^ = #0) then begin
  747. Result := ErrorState;
  748. FErrorCode := recSuddenEnd;
  749. Exit;
  750. end;
  751. {if the current char is one of the metacharacters, it's an error}
  752. if FPosn^ in MetaCharacters then begin
  753. Result := ErrorState;
  754. FErrorCode := recMetaChar;
  755. Exit;
  756. end;
  757. {otherwise add a state for the character}
  758. {..if it's an escaped character: get the next character instead}
  759. if (FPosn^ = '\') then
  760. begin
  761. inc(FPosn);
  762. ch := rcReturnEscapeChar;
  763. FRegexType := rtRegEx;
  764. end
  765. else
  766. ch :=FPosn^;
  767. if IgnoreCase then
  768. Ch := Upcase(ch);
  769. Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
  770. inc(FPosn);
  771. end;
  772. {--------}
  773. function TRegexEngine.rcParseCharClass(aClass : PCharSet) : boolean;
  774. begin
  775. {assume we can't parse a character class properly}
  776. Result := false;
  777. {parse a character range; if we can't there was an error and the
  778. caller will take care of it}
  779. if not rcParseCharRange(aClass) then
  780. Exit;
  781. {if the current character was not the right bracket, parse another
  782. character class (note: we're removing the tail recursion here)}
  783. while (FPosn^ <> ']') do begin
  784. if not rcParseCharRange(aClass) then
  785. Exit;
  786. end;
  787. {if we reach here we were successful}
  788. Result := true;
  789. end;
  790. {--------}
  791. function TRegexEngine.rcParseCharRange(aClass : PCharSet) : boolean;
  792. var
  793. StartChar : AnsiChar;
  794. EndChar : AnsiChar;
  795. Ch : AnsiChar;
  796. EscChar : Boolean;
  797. begin
  798. {assume we can't parse a character range properly}
  799. Result := false;
  800. {parse a single character; if it's null there was an error}
  801. StartChar := rcParseCCChar(EscChar);
  802. if (StartChar = #0) then
  803. Exit;
  804. if EscChar then
  805. begin
  806. case StartChar of
  807. 'd' : aClass^ := aClass^ + cs_digits;
  808. 'D' : aClass^ := aClass^ + cs_allchars-cs_digits;
  809. 's' : aClass^ := aClass^ + cs_whitespace;
  810. 'S' : aClass^ := aClass^ + cs_allchars-cs_whitespace;
  811. 'w' : aClass^ := aClass^ + cs_wordchars;
  812. 'W' : aClass^ := aClass^ + cs_allchars-cs_wordchars
  813. else
  814. EscChar := False;
  815. end;
  816. if EscChar then
  817. begin
  818. Result := True;
  819. Exit;
  820. end;
  821. end;
  822. {if the current character is not a dash, the range consisted of a
  823. single character}
  824. if (FPosn^ <> '-') then begin
  825. if IgnoreCase then
  826. Include(aClass^, Upcase(StartChar))
  827. else
  828. Include(aClass^, StartChar)
  829. end
  830. {otherwise it's a real range, so get the character at the end of the
  831. range; if that's null, there was an error}
  832. else begin
  833. inc(FPosn); {move past the '-'}
  834. EndChar := rcParseCCChar(EscChar);
  835. if (EndChar = #0) then
  836. Exit;
  837. {build the range as a character set}
  838. if (StartChar > EndChar) then begin
  839. Ch := StartChar;
  840. StartChar := EndChar;
  841. EndChar := Ch;
  842. end;
  843. for Ch := StartChar to EndChar do begin
  844. Include(aClass^, Ch);
  845. if IgnoreCase then
  846. Include(aClass^, Upcase(Ch));
  847. end;
  848. end;
  849. {if we reach here we were successful}
  850. Result := true;
  851. end;
  852. {--------}
  853. function TRegexEngine.rcParseExpr : integer;
  854. var
  855. StartState1 : integer;
  856. StartState2 : integer;
  857. EndState1 : integer;
  858. OverallStartState : integer;
  859. begin
  860. {assume the worst}
  861. Result := ErrorState;
  862. {parse an initial term}
  863. StartState1 := rcParseTerm;
  864. if (StartState1 = ErrorState) then
  865. Exit;
  866. {if the current character is *not* a pipe character, no alternation
  867. is present so return the start state of the initial term as our
  868. start state}
  869. if (FPosn^ <> '|') then
  870. Result := StartState1
  871. {otherwise, we need to parse another expr and join the two together
  872. in the transition table}
  873. else begin
  874. {advance past the pipe}
  875. inc(FPosn);
  876. {the initial term's end state does not exist yet (although there
  877. is a state in the term that points to it), so create it}
  878. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  879. {for the OR construction we need a new initial state: it will
  880. point to the initial term and the second just-about-to-be-parsed
  881. expr}
  882. OverallStartState := rcAddState(mtNone, #0, nil,
  883. UnusedState, UnusedState);
  884. {parse another expr}
  885. StartState2 := rcParseExpr;
  886. if (StartState2 = ErrorState) then
  887. Exit;
  888. {alter the state state for the overall expr so that the second
  889. link points to the start of the second expr}
  890. Result := rcSetState(OverallStartState, StartState1, StartState2);
  891. {now set the end state for the initial term to point to the final
  892. end state for the second expr and the overall expr}
  893. rcSetState(EndState1, FStateCount, UnusedState);
  894. {always handle expressions with a pipe as regular-expression}
  895. FRegexType := rtRegEx;
  896. end;
  897. end;
  898. {--------}
  899. function TRegexEngine.rcParseFactor : integer;
  900. var
  901. StartStateAtom : integer;
  902. EndStateAtom : integer;
  903. TempEndStateAtom : integer;
  904. Int : string;
  905. n,m,nState : integer;
  906. i : integer;
  907. begin
  908. {assume the worst}
  909. Result := ErrorState;
  910. {first parse an atom}
  911. StartStateAtom := rcParseAtom;
  912. if (StartStateAtom = ErrorState) then
  913. Exit;
  914. {check for a closure operator}
  915. case FPosn^ of
  916. '?' : begin
  917. {move past the ? operator}
  918. inc(FPosn);
  919. {the atom's end state doesn't exist yet, so create one}
  920. EndStateAtom := rcAddState(mtNone, #0, nil,
  921. UnusedState, UnusedState);
  922. {create a new start state for the overall regex}
  923. Result := rcAddState(mtNone, #0, nil,
  924. StartStateAtom, EndStateAtom);
  925. {make sure the new end state points to the next unused
  926. state}
  927. rcSetState(EndStateAtom, FStateCount, UnusedState);
  928. end;
  929. '*' : begin
  930. {move past the * operator}
  931. inc(FPosn);
  932. {the atom's end state doesn't exist yet, so create one;
  933. it'll be the start of the overall regex subexpression}
  934. Result := rcAddState(mtNone, #0, nil,
  935. NewFinalState, StartStateAtom);
  936. end;
  937. '+' : begin
  938. {move past the + operator}
  939. inc(FPosn);
  940. {the atom's end state doesn't exist yet, so create one}
  941. rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
  942. {the start of the overall regex subexpression will be the
  943. atom's start state}
  944. Result := StartStateAtom;
  945. end;
  946. '{' : begin // {n,m}
  947. {move past the brace }
  948. inc(FPosn);
  949. {Parse the value of n}
  950. Int := '';
  951. while not (FPosn^ in [',','}',#0]) do
  952. begin
  953. int := int+FPosn^;
  954. inc(FPosn);
  955. end;
  956. if FPosn^ = #0 then exit; // No end-brace or comma -> invalid regex
  957. if int <> '' then
  958. n := StrToIntDef(Int,-2)
  959. else
  960. n := -1; // if n is 'empty', set it to -1
  961. if n = -2 then exit; // Invalid value for n -> invalid RegEx
  962. if FPosn^ <> '}' then
  963. begin
  964. {move past the , }
  965. inc(FPosn);
  966. {Parse the value of m}
  967. Int := '';
  968. while not (FPosn^ in ['}',#0]) do
  969. begin
  970. int := int+FPosn^;
  971. inc(FPosn);
  972. end;
  973. if FPosn^ <> '}' then exit; // No end-brace -> invalid regex
  974. if int <> '' then m := StrToIntDef(Int,-2)
  975. else m := -1;
  976. if m = -2 then exit; // Invalid RegEx
  977. end
  978. else
  979. m := -3;
  980. {move past the brace }
  981. inc(FPosn);
  982. if (n=0) and (m=-1) then
  983. {the atom's end state doesn't exist yet, so create one;
  984. it'll be the start of the overall regex subexpression}
  985. Result := rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom)
  986. else
  987. begin
  988. EndStateAtom := FStateCount-1;
  989. TempEndStateAtom:=StartStateAtom;
  990. for i := 1 to n-1 do
  991. begin
  992. TempEndStateAtom:=FStateCount;
  993. for nState:=StartStateAtom to EndStateAtom do
  994. begin
  995. FStateTable[FStateCount]:=FStateTable[nState];
  996. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  997. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) *i;
  998. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  999. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) *i;
  1000. if FStateTable[FStateCount].sdMatchType = mtClass then
  1001. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1002. inc(FStateCount);
  1003. if FStateCount=length(FStateTable) then
  1004. setlength(FStateTable,(FStateCount * 3) div 2);
  1005. end;
  1006. end;
  1007. for i := n to m-1 do
  1008. begin
  1009. rcAddState(mtNone, #0, nil, NewFinalState, EndStateAtom+(EndStateAtom-StartStateAtom+1) * (m-1) + (m-n)+1);
  1010. TempEndStateAtom:=FStateCount;
  1011. for nState:=StartStateAtom to EndStateAtom do
  1012. begin
  1013. FStateTable[FStateCount]:=FStateTable[nState];
  1014. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  1015. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1016. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1017. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1018. if FStateTable[FStateCount].sdMatchType = mtClass then
  1019. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1020. inc(FStateCount);
  1021. if FStateCount=length(FStateTable) then
  1022. setlength(FStateTable,(FStateCount * 3) div 2);
  1023. end;
  1024. end;
  1025. if m = -1 then
  1026. rcAddState(mtNone, #0, nil, NewFinalState, TempEndStateAtom);
  1027. Result := StartStateAtom;
  1028. end;
  1029. {always handle expressions with braces as regular-expression}
  1030. FRegexType := rtRegEx;
  1031. end;
  1032. else
  1033. Result := StartStateAtom;
  1034. end;{case}
  1035. end;
  1036. {--------}
  1037. function TRegexEngine.rcParseTerm : integer;
  1038. var
  1039. StartState2 : integer;
  1040. EndState1 : integer;
  1041. begin
  1042. {parse an initial factor, the state number returned will also be our
  1043. return state number}
  1044. Result := rcParseFactor;
  1045. if (Result = ErrorState) then
  1046. Exit;
  1047. {Note: we have to "break the grammar" here. We've parsed a regular
  1048. subexpression and we're possibly following on with another
  1049. regular subexpression. There's no nice operator to key off
  1050. for concatenation: we just have to know that for
  1051. concatenating two subexpressions, the current character will
  1052. be
  1053. - an open parenthesis
  1054. - an open square bracket
  1055. - an any char operator
  1056. - a character that's not a metacharacter
  1057. i.e., the three possibilities for the start of an "atom" in
  1058. our grammar}
  1059. if (FPosn^ = '(') or
  1060. (FPosn^ = '[') or
  1061. (FPosn^ = '.') or
  1062. ((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
  1063. {the initial factor's end state does not exist yet (although there
  1064. is a state in the term that points to it), so create it}
  1065. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  1066. {parse another term}
  1067. StartState2 := rcParseTerm;
  1068. if (StartState2 = ErrorState) then begin
  1069. Result := ErrorState;
  1070. Exit;
  1071. end;
  1072. {join the first factor to the second term}
  1073. rcSetState(EndState1, StartState2, UnusedState);
  1074. end;
  1075. end;
  1076. procedure TRegexEngine.WriteTable;
  1077. var i : integer;
  1078. begin
  1079. for i := 0 to FStateCount-1 do with FStateTable[i] do
  1080. writeln('s:',i,' mt:',sdMatchType ,' ns1:',sdNextState1,' ns2:',sdNextState2,' char:',sdChar);
  1081. end;
  1082. procedure TRegexEngine.DequeEnqueue(aValue: integer);
  1083. begin
  1084. FList[FTail] := aValue;
  1085. inc(FTail);
  1086. if (FTail = FCapacity) then
  1087. FTail := 0
  1088. else if (FTail = FHead) then
  1089. DequeGrow;
  1090. end;
  1091. procedure TRegexEngine.DequePush(aValue: integer);
  1092. begin
  1093. if (FHead = 0) then
  1094. FHead := FCapacity;
  1095. dec(FHead);
  1096. FList[FHead] := aValue;
  1097. if (FTail = FHead) then
  1098. DequeGrow;
  1099. end;
  1100. function TRegexEngine.DequePop: integer;
  1101. begin
  1102. Result := FList[FHead];
  1103. inc(FHead);
  1104. if (FHead = FCapacity) then
  1105. FHead := 0;
  1106. end;
  1107. procedure TRegexEngine.DequeGrow;
  1108. var
  1109. OldCount : integer;
  1110. i, j : integer;
  1111. begin
  1112. {grow the list by 50%}
  1113. OldCount := FCapacity;
  1114. FCapacity:=(OldCount * 3) div 2;
  1115. SetLength(FList,FCapacity);
  1116. {expand the data into the increased space, maintaining the deque}
  1117. if (FHead = 0) then
  1118. FTail := OldCount
  1119. else begin
  1120. j := FCapacity;
  1121. for i := pred(OldCount) downto FHead do begin
  1122. dec(j);
  1123. FList[j] := FList[i]
  1124. end;
  1125. FHead := j;
  1126. end;
  1127. end;
  1128. function TRegexEngine.rcReturnEscapeChar: AnsiChar;
  1129. begin
  1130. case FPosn^ of
  1131. 't' : Result := #9;
  1132. 'n' : Result := #10;
  1133. 'r' : Result := #13;
  1134. 'f' : Result := #12;
  1135. 'a' : Result := #7;
  1136. else
  1137. Result := FPosn^;
  1138. end;
  1139. end;
  1140. {--------}
  1141. procedure TRegexEngine.rcSetIgnoreCase(aValue : boolean);
  1142. begin
  1143. if (aValue <> FIgnoreCase) then begin
  1144. rcClear;
  1145. FIgnoreCase := aValue;
  1146. end;
  1147. end;
  1148. {--------}
  1149. procedure TRegexEngine.rcSetRegexStr(const aRegexStr : string);
  1150. begin
  1151. if (aRegexStr <> FRegexStr) then begin
  1152. rcClear;
  1153. FRegexStr := aRegexStr;
  1154. end;
  1155. end;
  1156. {--------}
  1157. function TRegexEngine.rcSetState(aState : integer;
  1158. aNextState1: integer;
  1159. aNextState2: integer) : integer;
  1160. begin
  1161. Assert((0 <= aState) and (aState < FStateCount),
  1162. 'trying to change an invalid state');
  1163. {get the state record and change the transition information}
  1164. FStateTable[aState].sdNextState1 := aNextState1;
  1165. FStateTable[aState].sdNextState2 := aNextState2;
  1166. Result := aState;
  1167. end;
  1168. {--------}
  1169. procedure TRegexEngine.rcSetUpcase(aValue : TUpcaseFunc);
  1170. begin
  1171. if not Assigned(aValue) then
  1172. FUpcase := SystemUpcase
  1173. else
  1174. FUpcase := aValue;
  1175. end;
  1176. procedure TRegexEngine.rcSetMultiLine(aValue: Boolean);
  1177. begin
  1178. FMultiLine:=aValue;
  1179. end;
  1180. {====================================================================}
  1181. end.