regex.pp 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286
  1. {
  2. This file is part of the Free Pascal packages library.
  3. Copyright (c) 2008 by Joost van der Sluis, member of the
  4. Free Pascal development team
  5. Regexpression parser
  6. This code is based on the examples in the book
  7. 'Tomes of Delphi: Algorithms and Data Structures' by Julian M Bucknall
  8. The code is used with his permission. For an excellent explanation of
  9. this unit, see the book...
  10. See the file COPYING.FPC, included in this distribution,
  11. for details about the copyright.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. **********************************************************************}
  16. {$IFNDEF FPC_DOTTEDUNITS}
  17. unit Regex;
  18. {$ENDIF FPC_DOTTEDUNITS}
  19. {$mode Delphi}
  20. {$H-}
  21. {$INLINE ON}
  22. interface
  23. {Notes:
  24. these classes parse regular expressions that follow this grammar:
  25. <anchorexpr> ::= <expr> |
  26. '^' <expr> |
  27. <expr> '$' |
  28. '^' <expr> '$'
  29. <expr> ::= <term> |
  30. <term> '|' <expr> - alternation
  31. <term> ::= <factor> |
  32. <factor><term> - concatenation
  33. <factor> ::= <atom> |
  34. <atom> '?' | - zero or one
  35. <atom> '*' | - zero or more
  36. <atom> 'n,m' | - min n, max m (added by Joost)
  37. <atom> '+' - one or more
  38. <atom> ::= <AnsiChar> |
  39. '.' | - any AnsiChar
  40. '(' <expr> ') | - parentheses
  41. '[' <charclass> ']' | - normal class
  42. '[^' <charclass> ']' - negated class
  43. <charclass> ::= <charrange> |
  44. <charrange><charclass>
  45. <charrange> ::= <ccchar> |
  46. <ccchar> '-' <ccchar>
  47. <AnsiChar> ::= <any character except metacharacters> |
  48. '\' <any character at all>
  49. <ccchar> ::= <any character except '-' and ']'> |
  50. '\' <any character at all>
  51. This means that parentheses have maximum precedence, followed
  52. by square brackets, followed by the closure operators,
  53. followed by concatenation, finally followed by alternation.
  54. }
  55. {$IFDEF FPC_DOTTEDUNITS}
  56. uses
  57. System.SysUtils,
  58. System.Classes;
  59. {$ELSE FPC_DOTTEDUNITS}
  60. uses
  61. SysUtils,
  62. Classes;
  63. {$ENDIF FPC_DOTTEDUNITS}
  64. type
  65. TUpcaseFunc = function(aCh : AnsiChar) : AnsiChar;
  66. TNFAMatchType = ( {types of matching performed...}
  67. mtNone, {..no match (an epsilon no-cost move)}
  68. mtAnyChar, {..any character}
  69. mtChar, {..a particular character}
  70. mtClass, {..a character class}
  71. mtDupClass, {..a character class beying referenced}
  72. mtNegClass, {..a negated character class}
  73. mtTerminal, {..the final state--no matching}
  74. mtUnused); {..an unused state--no matching}
  75. TRegexError = ( {error codes for invalid regex strings}
  76. recNone, {..no error}
  77. recSuddenEnd, {..unexpected end of String}
  78. recMetaChar, {..read metacharacter, but needed normal AnsiChar}
  79. recNoCloseParen, {..expected close paren, but not there}
  80. recExtraChars {..not at end of string after parsing regex}
  81. );
  82. TRegexType = (
  83. rtRegEx,
  84. rtChars,
  85. rtSingleChar
  86. );
  87. PCharSet = ^TCharSet;
  88. TCharSet = set of AnsiChar;
  89. { TtdRegexEngine }
  90. TNFAState = record
  91. sdNextState1: integer;
  92. sdNextState2: integer;
  93. sdClass : PCharSet;
  94. sdMatchType : TNFAMatchType;
  95. sdChar : AnsiChar;
  96. end;
  97. { TRegexEngine }
  98. TRegexEngine = class
  99. private
  100. FAnchorEnd : boolean;
  101. FAnchorStart: boolean;
  102. FErrorCode : TRegexError;
  103. FIgnoreCase : boolean;
  104. FMultiLine : boolean;
  105. FPosn : PAnsiChar;
  106. FRegexStr : AnsiString;
  107. FStartState : integer;
  108. FStateTable : Array of TNFAState;
  109. FStateCount : integer;
  110. FUpcase : TUpcaseFunc;
  111. // The deque (double-ended queue)
  112. FList : array of integer;
  113. FCapacity : integer;
  114. FHead : integer;
  115. FTail : integer;
  116. FRegexType : TRegexType;
  117. protected
  118. procedure DequeEnqueue(aValue : integer);
  119. procedure DequePush(aValue : integer);
  120. function DequePop : integer;
  121. procedure DequeGrow;
  122. procedure rcSetIgnoreCase(aValue : boolean); virtual;
  123. procedure rcSetRegexStr(const aRegexStr : AnsiString); virtual;
  124. procedure rcSetUpcase(aValue : TUpcaseFunc); virtual;
  125. procedure rcSetMultiLine(aValue : Boolean); virtual;
  126. procedure rcClear; virtual;
  127. procedure rcError(aIndex : integer); virtual;
  128. procedure rcLevel1Optimize; virtual;
  129. function rcMatchSubString(const S : AnsiString;
  130. StartPosn : integer;
  131. var Len : integer) : boolean; virtual;
  132. function rcAddState(aMatchType : TNFAMatchType;
  133. aChar : AnsiChar;
  134. aCharClass : PCharSet;
  135. aNextState1: integer;
  136. aNextState2: integer) : integer;
  137. function rcSetState(aState : integer;
  138. aNextState1: integer;
  139. aNextState2: integer) : integer;
  140. function rcParseAnchorExpr : integer; virtual;
  141. function rcParseAtom : integer; virtual;
  142. function rcParseCCChar(out EscapeChar : Boolean) : AnsiChar; virtual;
  143. function rcParseChar : integer; virtual;
  144. function rcParseCharClass(aClass : PCharSet) : boolean; virtual;
  145. function rcParseCharRange(aClass : PCharSet) : boolean; virtual;
  146. function rcParseExpr : integer; virtual;
  147. function rcParseFactor : integer; virtual;
  148. function rcParseTerm : integer; virtual;
  149. Function rcReturnEscapeChar : AnsiChar; virtual;
  150. public
  151. procedure WriteTable;
  152. constructor Create(const aRegexStr : AnsiString);
  153. destructor Destroy; override;
  154. function Parse(out aErrorPos : integer;
  155. out aErrorCode: TRegexError) : boolean; virtual;
  156. function MatchString(const S : AnsiString; out MatchPos : integer; var Offset : integer) : boolean; virtual;
  157. function ReplaceAllString(const src, newstr: ansistring; out DestStr : AnsiString): Integer;
  158. property IgnoreCase : boolean
  159. read FIgnoreCase write rcSetIgnoreCase;
  160. property MultiLine : boolean
  161. read FMultiLine write rcSetMultiLine;
  162. property RegexString : AnsiString
  163. read FRegexStr write rcSetRegexStr;
  164. property Upcase : TUpcaseFunc
  165. read FUpcase write rcSetUpcase;
  166. end;
  167. Resourcestring
  168. eRegexParseError = 'Error at %d when parsing regular expression';
  169. implementation
  170. {$IFDEF FPC_DOTTEDUNITS}
  171. uses System.StrUtils;
  172. {$ELSE FPC_DOTTEDUNITS}
  173. uses strutils;
  174. {$ENDIF FPC_DOTTEDUNITS}
  175. const
  176. MetaCharacters : set of AnsiChar =
  177. ['[', ']', '(', ')', '|', '*', '+', '?', '-', '.',
  178. '^', '$', '{', '}'];
  179. newline : TCharSet = [#10,#13,#$85];
  180. {some handy constants}
  181. UnusedState = -1;
  182. NewFinalState = -2;
  183. CreateNewState = -3;
  184. ErrorState = -4;
  185. MustScan = -5;
  186. cs_allchars : tcharset = [#0..#255];
  187. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  188. cs_newline : tcharset = [#10];
  189. cs_digits : tcharset = ['0'..'9'];
  190. cs_whitespace : tcharset = [' ',#9];
  191. {===Helper routines==================================================}
  192. function SystemUpcase(aCh : AnsiChar) : AnsiChar;
  193. begin
  194. Result := System.Upcase(aCh);
  195. end;
  196. {====================================================================}
  197. {===TRegexEngine===================================================}
  198. constructor TRegexEngine.Create(const aRegexStr : AnsiString);
  199. begin
  200. inherited Create;
  201. FRegexStr := aRegexStr;
  202. FIgnoreCase := false;
  203. FUpcase := SystemUpcase;
  204. SetLength(FStateTable,64);
  205. FStateCount:=0;
  206. FCapacity:=64;
  207. setlength(FList,FCapacity);
  208. {let's help out the user of the deque by putting the head and
  209. tail pointers in the middle: it's probably more efficient}
  210. FHead := FCapacity div 2;
  211. FTail := FHead;
  212. MultiLine:=False;
  213. end;
  214. {--------}
  215. destructor TRegexEngine.Destroy;
  216. begin
  217. if (FStateTable <> nil) then
  218. rcClear;
  219. inherited Destroy;
  220. end;
  221. {--------}
  222. function TRegexEngine.MatchString(const S : AnsiString; out MatchPos : integer; var Offset : integer): boolean;
  223. var
  224. i : integer;
  225. ErrorPos : integer;
  226. ErrorCode : TRegexError;
  227. pc : PAnsiChar;
  228. x:integer;
  229. begin
  230. if Offset>length(S) then
  231. begin
  232. Result := False;
  233. MatchPos := 0;
  234. Exit;
  235. end;
  236. {if the regex string hasn't been parsed yet, do so}
  237. if (FStateCount = 0) then begin
  238. if not Parse(ErrorPos, ErrorCode) then
  239. rcError(ErrorPos);
  240. end;
  241. case FRegexType of
  242. rtSingleChar :
  243. begin
  244. MatchPos := PosEx(AnsiChar(FRegexStr[1]),s,Offset);
  245. Offset := MatchPos+1;
  246. Result := (MatchPos>0);
  247. end;
  248. rtChars :
  249. begin
  250. MatchPos := PosEx(FRegexStr,s,Offset);
  251. Offset := MatchPos+length(FRegexStr);
  252. Result := (MatchPos>0);
  253. end
  254. else
  255. begin
  256. {now try and see if the string matches (empty strings don't)}
  257. Result := False;
  258. MatchPos := 0;
  259. if (S <> '') then
  260. {if the regex specified a start anchor then we
  261. need to check the string starting at the first position}
  262. if FAnchorStart then begin
  263. if rcMatchSubString(S, 1, Offset) then
  264. begin
  265. MatchPos:=1;
  266. Result := True;
  267. end
  268. {If the first position did not match ang MultiLine is false, the string
  269. doesn't match. If MultiLine is true, start at every position after a
  270. newline }
  271. else if FMultiLine then begin
  272. for i := Offset to length(S)-1 do
  273. if S[i] in newline then
  274. if rcMatchSubString(S, i+1, Offset) then begin
  275. MatchPos := i+1;
  276. Result := True;
  277. Break;
  278. end;
  279. end
  280. end
  281. {otherwise we try and match the string at every position and
  282. return at the first success}
  283. else begin
  284. for i := Offset to length(S) do
  285. if rcMatchSubString(S, i, Offset) then begin
  286. MatchPos:=i;
  287. Result := True;
  288. Break;
  289. end;
  290. end;
  291. end;
  292. end; {case}
  293. end;
  294. function TRegexEngine.ReplaceAllString(const src, newstr: ansistring; out DestStr : AnsiString): Integer;
  295. type TReplRec = record
  296. Pos : integer;
  297. Len : integer;
  298. end;
  299. var ofs : Integer;
  300. size_newstr,
  301. size, pos : Integer;
  302. ReplArr : array of TReplRec;
  303. racount : integer;
  304. MatchPos : integer;
  305. DestSize : integer;
  306. LastPos : integer;
  307. MoveLen : integer;
  308. i : integer;
  309. begin
  310. setlength(ReplArr,64);
  311. racount := 0;
  312. DestSize:=length(src);
  313. size_newstr := length(newstr);
  314. Ofs := 1;
  315. while MatchString(src,MatchPos,Ofs) do
  316. begin
  317. if racount = length(ReplArr) then
  318. setlength(ReplArr,racount+racount div 2);
  319. ReplArr[racount].Pos := MatchPos;
  320. ReplArr[racount].Len := ofs;
  321. DestSize:=DestSize-ofs+MatchPos+size_newstr;
  322. inc(racount);
  323. end;
  324. SetLength(DestStr, SizeOf(AnsiChar)*DestSize);
  325. MatchPos:=1; LastPos:=1;
  326. if size_newstr<>0 then for i := 0 to racount -1 do
  327. begin
  328. MoveLen := ReplArr[i].Pos-LastPos;
  329. move(src[LastPos],DestStr[MatchPos],MoveLen);
  330. MatchPos:=MatchPos+MoveLen;
  331. LastPos := ReplArr[i].Len;
  332. move(newstr[1],DestStr[MatchPos],size_newstr);
  333. Matchpos := MatchPos+size_newstr;
  334. end
  335. else for i := 0 to racount -1 do
  336. begin
  337. MoveLen := ReplArr[i].Pos-LastPos;
  338. move(src[LastPos],DestStr[MatchPos],MoveLen);
  339. MatchPos:=MatchPos+MoveLen;
  340. LastPos := ReplArr[i].Len;
  341. end;
  342. move(src[LastPos],DestStr[MatchPos],length(src)-LastPos+1);
  343. Result := racount;
  344. end;
  345. {--------}
  346. function TRegexEngine.Parse(out aErrorPos : integer;
  347. out aErrorCode: TRegexError)
  348. : boolean;
  349. begin
  350. {clear the current transition table}
  351. rcClear;
  352. {empty regex strings are not allowed}
  353. if (FRegexStr = '') then begin
  354. Result := false;
  355. aErrorPos := 1;
  356. aErrorCode := recSuddenEnd;
  357. Exit;
  358. end;
  359. {parse the regex string}
  360. if not IgnoreCase then
  361. begin
  362. if length(FRegexStr)=1 then
  363. FRegexType:=rtSingleChar
  364. else
  365. FRegexType:=rtChars
  366. end
  367. else
  368. FRegexType:=rtRegEx;
  369. FPosn := PAnsiChar(FRegexStr);
  370. FStartState := rcParseAnchorExpr;
  371. {if an error occurred or we're not at the end of the regex string,
  372. clear the transition table, return false and the error position}
  373. if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
  374. if (FStartState <> ErrorState) and (FPosn^ <> #0) then
  375. FErrorCode := recExtraChars;
  376. rcClear;
  377. Result := false;
  378. aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
  379. aErrorCode := FErrorCode;
  380. end
  381. {otherwise add a terminal state, optimize, return true}
  382. else begin
  383. rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
  384. rcLevel1Optimize;
  385. if FAnchorStart or FAnchorEnd then FRegexType:= rtRegEx;
  386. Result := true;
  387. aErrorPos := 0;
  388. aErrorCode := recNone;
  389. end;
  390. end;
  391. {--------}
  392. function TRegexEngine.rcAddState(aMatchType : TNFAMatchType;
  393. aChar : AnsiChar;
  394. aCharClass : PCharSet;
  395. aNextState1: integer;
  396. aNextState2: integer) : integer;
  397. begin
  398. {set up the fields in the state record}
  399. with FStateTable[FStateCount] do
  400. begin
  401. if (aNextState1 = NewFinalState) then
  402. sdNextState1 := FStateCount+1
  403. else
  404. sdNextState1 := aNextState1;
  405. sdNextState2 := aNextState2;
  406. sdMatchType := aMatchType;
  407. if (aMatchType = mtChar) then
  408. sdChar := aChar
  409. else if aMatchType in [mtClass, mtDupClass, mtNegClass] then
  410. sdClass := aCharClass;
  411. end;
  412. Result := FStateCount;
  413. inc(FStateCount);
  414. if FStateCount=length(FStateTable) then
  415. setlength(FStateTable,(FStateCount * 3) div 2);
  416. if not (aMatchType in [mtChar,mtTerminal,mtNone]) then FRegexType := rtRegEx;
  417. end;
  418. {--------}
  419. procedure TRegexEngine.rcClear;
  420. var
  421. i, j : integer;
  422. begin
  423. {free all items in the state transition table}
  424. for i := 0 to FStateCount-1 do begin
  425. with FStateTable[i] do begin
  426. if (sdMatchType = mtClass) or
  427. (sdMatchType = mtNegClass) and
  428. (sdClass <> nil) then
  429. begin
  430. for j := i+1 to FStateCount-1 do
  431. if (FStateTable[j].sdClass = sdClass) then
  432. FStateTable[j].sdClass := nil;
  433. FreeMem(sdClass, sizeof(TCharSet));
  434. end;
  435. // I am not sure if the next line is necessary. rcAddState set all values, so
  436. // it shouldn't be necessary to clear its contents?
  437. // FillChar(FStateTable[i],SizeOf(FStateTable[i]),#0);
  438. end;
  439. end;
  440. {clear the state transition table}
  441. FStateCount:=0;
  442. FAnchorStart := false;
  443. FAnchorEnd := false;
  444. end;
  445. {--------}
  446. procedure TRegexEngine.rcError(aIndex : integer);
  447. begin
  448. raise Exception.Create(Format(eRegexParseError,[aIndex]));
  449. end;
  450. {--------}
  451. procedure TRegexEngine.rcLevel1Optimize;
  452. var
  453. i : integer;
  454. Walker : integer;
  455. begin
  456. {level 1 optimization removes all states that have only a single
  457. no-cost move to another state}
  458. {cycle through all the state records, except for the last one}
  459. for i := 0 to FStateCount - 2 do begin
  460. {get this state}
  461. with FStateTable[i] do begin
  462. {walk the chain pointed to by the first next state, unlinking
  463. the states that are simple single no-cost moves}
  464. Walker := sdNextState1;
  465. while (FStateTable[walker].sdMatchType = mtNone) and
  466. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  467. sdNextState1 := FStateTable[walker].sdNextState1;
  468. Walker := sdNextState1;
  469. end;
  470. {walk the chain pointed to by the second next state, unlinking
  471. the states that are simple single no-cost moves}
  472. if (sdNextState2 <> UnusedState) then begin
  473. Walker := sdNextState2;
  474. while (FStateTable[walker].sdMatchType = mtNone) and
  475. (FStateTable[walker].sdNextState2 = UnusedState) do begin
  476. sdNextState2 := FStateTable[walker].sdNextState1;
  477. Walker := sdNextState2;
  478. end;
  479. end;
  480. end;
  481. end;
  482. {cycle through all the state records, except for the last one,
  483. marking unused ones--not strictly necessary but good for debugging}
  484. for i := 0 to FStateCount - 2 do begin
  485. with FStateTable[i] do begin
  486. if (sdMatchType = mtNone) and
  487. (sdNextState2 = UnusedState) then
  488. sdMatchType := mtUnused;
  489. end;
  490. end;
  491. end;
  492. {--------}
  493. function TRegexEngine.rcMatchSubString(const s : AnsiString;
  494. StartPosn : integer;
  495. var Len : integer)
  496. : boolean;
  497. var
  498. Ch : AnsiChar;
  499. State : integer;
  500. StrInx : integer;
  501. LenStr : integer;
  502. begin
  503. {assume we fail to match}
  504. Result := false;
  505. Len := StartPosn;
  506. LenStr := Length(s);
  507. {clear the deque}
  508. FHead := FCapacity div 2;
  509. FTail := FHead;
  510. {enqueue the special value to start scanning}
  511. DequeEnqueue(MustScan);
  512. {enqueue the first state}
  513. DequeEnqueue(FStartState);
  514. {prepare the string index}
  515. StrInx := StartPosn;
  516. {loop until the deque is empty or we run out of string}
  517. repeat
  518. {pop the top state from the deque}
  519. State := DequePop;
  520. {process the "must scan" state first}
  521. if (State = MustScan) then begin
  522. {if the deque is empty at this point, we might as well give up
  523. since there are no states left to process new characters}
  524. if (FHead <> FTail) then begin
  525. {if we haven't run out of string, get the character, and
  526. enqueue the "must scan" state again}
  527. if IgnoreCase then
  528. Ch := Upcase(s[StrInx])
  529. else
  530. Ch := s[StrInx];
  531. DequeEnqueue(MustScan);
  532. inc(StrInx);
  533. end;
  534. end
  535. {otherwise, process the state}
  536. else with FStateTable[State] do begin
  537. case sdMatchType of
  538. mtChar :
  539. begin
  540. {for a match of a character, enqueue the next state}
  541. if (Ch = sdChar) then
  542. DequeEnqueue(sdNextState1);
  543. end;
  544. mtAnyChar :
  545. begin
  546. {for a match of any character, enqueue the next state}
  547. if not (Ch in newline) then
  548. DequeEnqueue(sdNextState1);
  549. end;
  550. mtClass, mtDupClass :
  551. begin
  552. {for a match within a class, enqueue the next state}
  553. if (Ch in sdClass^) then
  554. DequeEnqueue(sdNextState1);
  555. end;
  556. mtNegClass :
  557. begin
  558. {for a match not within a class, enqueue the next state}
  559. if not (Ch in sdClass^) then
  560. DequeEnqueue(sdNextState1);
  561. end;
  562. mtTerminal :
  563. begin
  564. {for a terminal state, the string successfully matched
  565. if the regex had no end anchor, or we're at the end
  566. of the string or line}
  567. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  568. Result := true;
  569. Len := StrInx-1;
  570. // Exit;
  571. end;
  572. end;
  573. mtNone :
  574. begin
  575. {for free moves, push the next states onto the deque}
  576. Assert(sdNextState2 <> UnusedState,
  577. 'optimization should remove all states with one no-cost move');
  578. DequePush(sdNextState2);
  579. DequePush(sdNextState1);
  580. end;
  581. mtUnused :
  582. begin
  583. Assert(false, 'unused states shouldn''t be seen');
  584. end;
  585. end;
  586. end;
  587. until (FHead = FTail) or (StrInx > LenStr); // deque empty or end of string
  588. {if we reach this point we've either exhausted the deque or we've
  589. run out of string; if the former, the substring did not match
  590. since there are no more states. If the latter, we need to check
  591. the states left on the deque to see if one is the terminating
  592. state; if so the string matched the regular expression defined by
  593. the transition table}
  594. while (FHead <> FTail) and (StrInx<=LenStr) do begin
  595. State := DequePop;
  596. with FStateTable[State] do begin
  597. case sdMatchType of
  598. mtNone :
  599. begin
  600. {for free moves, push the next states onto the deque}
  601. Assert(sdNextState2 <> UnusedState,
  602. 'optimization should remove all states with one no-cost move');
  603. DequePush(sdNextState2);
  604. DequePush(sdNextState1);
  605. end;
  606. mtTerminal :
  607. begin
  608. {for a terminal state, the string successfully matched
  609. if the regex had no end anchor, or we're at the end
  610. of the string or line}
  611. if (not FAnchorEnd) or (ch=#0) or (FMultiLine and (ch in newline)) then begin
  612. Result := true;
  613. Len := StrInx -1;
  614. Exit;
  615. end;
  616. end;
  617. end;{case}
  618. end;
  619. end;
  620. end;
  621. {--------}
  622. function TRegexEngine.rcParseAnchorExpr : integer;
  623. begin
  624. {check for an initial '^'}
  625. if (FPosn^ = '^') then begin
  626. FAnchorStart := true;
  627. inc(FPosn);
  628. end;
  629. {parse an expression}
  630. Result := rcParseExpr;
  631. {if we were successful, check for the final '$'}
  632. if (Result <> ErrorState) then begin
  633. if (FPosn^ = '$') then begin
  634. FAnchorEnd := true;
  635. inc(FPosn);
  636. end;
  637. end;
  638. end;
  639. {--------}
  640. function TRegexEngine.rcParseAtom : integer;
  641. var
  642. MatchType : TNFAMatchType;
  643. CharClass : PCharSet;
  644. begin
  645. case FPosn^ of
  646. '(' :
  647. begin
  648. {move past the open parenthesis}
  649. inc(FPosn);
  650. {parse a complete regex between the parentheses}
  651. Result := rcParseExpr;
  652. if (Result = ErrorState) then
  653. Exit;
  654. {if the current character is not a close parenthesis,
  655. there's an error}
  656. if (FPosn^ <> ')') then begin
  657. FErrorCode := recNoCloseParen;
  658. Result := ErrorState;
  659. Exit;
  660. end;
  661. {move past the close parenthesis}
  662. inc(FPosn);
  663. {always handle expressions with parentheses as regular-expression}
  664. FRegexType := rtRegEx;
  665. end;
  666. '[' :
  667. begin
  668. {move past the open square bracket}
  669. inc(FPosn);
  670. {if the first character in the class is a '^' then the
  671. class if negated, otherwise it's a normal one}
  672. if (FPosn^ = '^') then begin
  673. inc(FPosn);
  674. MatchType := mtNegClass;
  675. end
  676. else begin
  677. MatchType := mtClass;
  678. end;
  679. {allocate the class character set and parse the character
  680. class; this will return either with an error, or when the
  681. closing square bracket is encountered}
  682. New(CharClass);
  683. CharClass^ := [];
  684. if not rcParseCharClass(CharClass) then begin
  685. Dispose(CharClass);
  686. Result := ErrorState;
  687. Exit;
  688. end;
  689. {move past the closing square bracket}
  690. Assert(FPosn^ = ']',
  691. 'the rcParseCharClass terminated without finding a "]"');
  692. inc(FPosn);
  693. {add a new state for the character class}
  694. Result := rcAddState(MatchType, #0, CharClass,
  695. NewFinalState, UnusedState);
  696. end;
  697. '.' :
  698. begin
  699. {move past the period metacharacter}
  700. inc(FPosn);
  701. {add a new state for the 'any character' token}
  702. Result := rcAddState(mtAnyChar, #0, nil,
  703. NewFinalState, UnusedState);
  704. end;
  705. '\' :
  706. begin
  707. if (FPosn+1)^ in ['d','D','s','S','w','W'] then begin
  708. New(CharClass);
  709. CharClass^ := [];
  710. if not rcParseCharRange(CharClass) then begin
  711. Dispose(CharClass);
  712. Result := ErrorState;
  713. Exit;
  714. end;
  715. Result := rcAddState(mtClass, #0, CharClass,
  716. NewFinalState, UnusedState);
  717. end
  718. else
  719. Result := rcParseChar;
  720. end;
  721. else
  722. {otherwise parse a single character}
  723. Result := rcParseChar;
  724. end;{case}
  725. end;
  726. {--------}
  727. function TRegexEngine.rcParseCCChar(out EscapeChar : Boolean) : AnsiChar;
  728. begin
  729. EscapeChar:=False;
  730. {if we hit the end of the string, it's an error}
  731. if (FPosn^ = #0) then begin
  732. FErrorCode := recSuddenEnd;
  733. Result := #0;
  734. Exit;
  735. end;
  736. {if the current AnsiChar is a metacharacter (at least in terms of a
  737. character class), it's an error}
  738. if FPosn^ in [']', '-'] then begin
  739. FErrorCode := recMetaChar;
  740. Result := #0;
  741. Exit;
  742. end;
  743. {otherwise return the character and advance past it}
  744. if (FPosn^ = '\') then
  745. {..it's an escaped character: get the next character instead}
  746. begin
  747. inc(FPosn);
  748. EscapeChar:=True;
  749. Result := rcReturnEscapeChar;
  750. end
  751. else
  752. Result := FPosn^;
  753. inc(FPosn);
  754. end;
  755. {--------}
  756. function TRegexEngine.rcParseChar : integer;
  757. var
  758. Ch : AnsiChar;
  759. begin
  760. {if we hit the end of the string, it's an error}
  761. if (FPosn^ = #0) then begin
  762. Result := ErrorState;
  763. FErrorCode := recSuddenEnd;
  764. Exit;
  765. end;
  766. {if the current AnsiChar is one of the metacharacters, it's an error}
  767. if FPosn^ in MetaCharacters then begin
  768. Result := ErrorState;
  769. FErrorCode := recMetaChar;
  770. Exit;
  771. end;
  772. {otherwise add a state for the character}
  773. {..if it's an escaped character: get the next character instead}
  774. if (FPosn^ = '\') then
  775. begin
  776. inc(FPosn);
  777. ch := rcReturnEscapeChar;
  778. FRegexType := rtRegEx;
  779. end
  780. else
  781. ch :=FPosn^;
  782. if IgnoreCase then
  783. Ch := Upcase(ch);
  784. Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
  785. inc(FPosn);
  786. end;
  787. {--------}
  788. function TRegexEngine.rcParseCharClass(aClass : PCharSet) : boolean;
  789. begin
  790. {assume we can't parse a character class properly}
  791. Result := false;
  792. {parse a character range; if we can't there was an error and the
  793. caller will take care of it}
  794. if not rcParseCharRange(aClass) then
  795. Exit;
  796. {if the current character was not the right bracket, parse another
  797. character class (note: we're removing the tail recursion here)}
  798. while (FPosn^ <> ']') do begin
  799. if not rcParseCharRange(aClass) then
  800. Exit;
  801. end;
  802. {if we reach here we were successful}
  803. Result := true;
  804. end;
  805. {--------}
  806. function TRegexEngine.rcParseCharRange(aClass : PCharSet) : boolean;
  807. var
  808. StartChar : AnsiChar;
  809. EndChar : AnsiChar;
  810. Ch : AnsiChar;
  811. EscChar : Boolean;
  812. begin
  813. {assume we can't parse a character range properly}
  814. Result := false;
  815. {parse a single character; if it's null there was an error}
  816. StartChar := rcParseCCChar(EscChar);
  817. if (StartChar = #0) then
  818. Exit;
  819. if EscChar then
  820. begin
  821. case StartChar of
  822. 'd' : aClass^ := aClass^ + cs_digits;
  823. 'D' : aClass^ := aClass^ + cs_allchars-cs_digits;
  824. 's' : aClass^ := aClass^ + cs_whitespace;
  825. 'S' : aClass^ := aClass^ + cs_allchars-cs_whitespace;
  826. 'w' : aClass^ := aClass^ + cs_wordchars;
  827. 'W' : aClass^ := aClass^ + cs_allchars-cs_wordchars
  828. else
  829. EscChar := False;
  830. end;
  831. if EscChar then
  832. begin
  833. Result := True;
  834. Exit;
  835. end;
  836. end;
  837. {if the current character is not a dash, the range consisted of a
  838. single character}
  839. if (FPosn^ <> '-') then begin
  840. if IgnoreCase then
  841. Include(aClass^, Upcase(StartChar))
  842. else
  843. Include(aClass^, StartChar)
  844. end
  845. {otherwise it's a real range, so get the character at the end of the
  846. range; if that's null, there was an error}
  847. else begin
  848. inc(FPosn); {move past the '-'}
  849. EndChar := rcParseCCChar(EscChar);
  850. if (EndChar = #0) then
  851. Exit;
  852. {build the range as a character set}
  853. if (StartChar > EndChar) then begin
  854. Ch := StartChar;
  855. StartChar := EndChar;
  856. EndChar := Ch;
  857. end;
  858. for Ch := StartChar to EndChar do begin
  859. Include(aClass^, Ch);
  860. if IgnoreCase then
  861. Include(aClass^, Upcase(Ch));
  862. end;
  863. end;
  864. {if we reach here we were successful}
  865. Result := true;
  866. end;
  867. {--------}
  868. function TRegexEngine.rcParseExpr : integer;
  869. var
  870. StartState1 : integer;
  871. StartState2 : integer;
  872. EndState1 : integer;
  873. OverallStartState : integer;
  874. begin
  875. {assume the worst}
  876. Result := ErrorState;
  877. {parse an initial term}
  878. StartState1 := rcParseTerm;
  879. if (StartState1 = ErrorState) then
  880. Exit;
  881. {if the current character is *not* a pipe character, no alternation
  882. is present so return the start state of the initial term as our
  883. start state}
  884. if (FPosn^ <> '|') then
  885. Result := StartState1
  886. {otherwise, we need to parse another expr and join the two together
  887. in the transition table}
  888. else begin
  889. {advance past the pipe}
  890. inc(FPosn);
  891. {the initial term's end state does not exist yet (although there
  892. is a state in the term that points to it), so create it}
  893. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  894. {for the OR construction we need a new initial state: it will
  895. point to the initial term and the second just-about-to-be-parsed
  896. expr}
  897. OverallStartState := rcAddState(mtNone, #0, nil,
  898. UnusedState, UnusedState);
  899. {parse another expr}
  900. StartState2 := rcParseExpr;
  901. if (StartState2 = ErrorState) then
  902. Exit;
  903. {alter the state state for the overall expr so that the second
  904. link points to the start of the second expr}
  905. Result := rcSetState(OverallStartState, StartState1, StartState2);
  906. {now set the end state for the initial term to point to the final
  907. end state for the second expr and the overall expr}
  908. rcSetState(EndState1, FStateCount, UnusedState);
  909. {always handle expressions with a pipe as regular-expression}
  910. FRegexType := rtRegEx;
  911. end;
  912. end;
  913. {--------}
  914. function TRegexEngine.rcParseFactor : integer;
  915. var
  916. StartStateAtom : integer;
  917. EndStateAtom : integer;
  918. TempEndStateAtom : integer;
  919. Int : AnsiString;
  920. n,m,nState : integer;
  921. i : integer;
  922. begin
  923. {assume the worst}
  924. Result := ErrorState;
  925. {first parse an atom}
  926. StartStateAtom := rcParseAtom;
  927. if (StartStateAtom = ErrorState) then
  928. Exit;
  929. {check for a closure operator}
  930. case FPosn^ of
  931. '?' : begin
  932. {move past the ? operator}
  933. inc(FPosn);
  934. {the atom's end state doesn't exist yet, so create one}
  935. EndStateAtom := rcAddState(mtNone, #0, nil,
  936. UnusedState, UnusedState);
  937. {create a new start state for the overall regex}
  938. Result := rcAddState(mtNone, #0, nil,
  939. StartStateAtom, EndStateAtom);
  940. {make sure the new end state points to the next unused
  941. state}
  942. rcSetState(EndStateAtom, FStateCount, UnusedState);
  943. end;
  944. '*' : begin
  945. {move past the * operator}
  946. inc(FPosn);
  947. {the atom's end state doesn't exist yet, so create one;
  948. it'll be the start of the overall regex subexpression}
  949. Result := rcAddState(mtNone, #0, nil,
  950. NewFinalState, StartStateAtom);
  951. end;
  952. '+' : begin
  953. {move past the + operator}
  954. inc(FPosn);
  955. {the atom's end state doesn't exist yet, so create one}
  956. rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
  957. {the start of the overall regex subexpression will be the
  958. atom's start state}
  959. Result := StartStateAtom;
  960. end;
  961. '{' : begin // {n,m}
  962. {move past the brace }
  963. inc(FPosn);
  964. {Parse the value of n}
  965. Int := '';
  966. while not (FPosn^ in [',','}',#0]) do
  967. begin
  968. int := int+FPosn^;
  969. inc(FPosn);
  970. end;
  971. if FPosn^ = #0 then exit; // No end-brace or comma -> invalid regex
  972. if int <> '' then
  973. n := StrToIntDef(Int,-2)
  974. else
  975. n := -1; // if n is 'empty', set it to -1
  976. if n = -2 then exit; // Invalid value for n -> invalid RegEx
  977. if FPosn^ <> '}' then
  978. begin
  979. {move past the , }
  980. inc(FPosn);
  981. {Parse the value of m}
  982. Int := '';
  983. while not (FPosn^ in ['}',#0]) do
  984. begin
  985. int := int+FPosn^;
  986. inc(FPosn);
  987. end;
  988. if FPosn^ <> '}' then exit; // No end-brace -> invalid regex
  989. if int <> '' then m := StrToIntDef(Int,-2)
  990. else m := -1;
  991. if m = -2 then exit; // Invalid RegEx
  992. end
  993. else
  994. m := -3;
  995. {move past the brace }
  996. inc(FPosn);
  997. if (n=0) and (m=-1) then
  998. {the atom's end state doesn't exist yet, so create one;
  999. it'll be the start of the overall regex subexpression}
  1000. Result := rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom)
  1001. else
  1002. begin
  1003. EndStateAtom := FStateCount-1;
  1004. TempEndStateAtom:=StartStateAtom;
  1005. for i := 1 to n-1 do
  1006. begin
  1007. TempEndStateAtom:=FStateCount;
  1008. for nState:=StartStateAtom to EndStateAtom do
  1009. begin
  1010. FStateTable[FStateCount]:=FStateTable[nState];
  1011. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  1012. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) *i;
  1013. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1014. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) *i;
  1015. if FStateTable[FStateCount].sdMatchType = mtClass then
  1016. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1017. inc(FStateCount);
  1018. if FStateCount=length(FStateTable) then
  1019. setlength(FStateTable,(FStateCount * 3) div 2);
  1020. end;
  1021. end;
  1022. for i := n to m-1 do
  1023. begin
  1024. rcAddState(mtNone, #0, nil, NewFinalState, EndStateAtom+(EndStateAtom-StartStateAtom+1) * (m-1) + (m-n)+1);
  1025. TempEndStateAtom:=FStateCount;
  1026. for nState:=StartStateAtom to EndStateAtom do
  1027. begin
  1028. FStateTable[FStateCount]:=FStateTable[nState];
  1029. if FStateTable[FStateCount].sdNextState1 in [StartStateAtom..EndStateAtom+1] then
  1030. FStateTable[FStateCount].sdNextState1 := i+FStateTable[FStateCount].sdNextState1+ (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1031. if FStateTable[FStateCount].sdNextState2 in [StartStateAtom..EndStateAtom+1] then
  1032. FStateTable[FStateCount].sdNextState2 := i+FStateTable[FStateCount].sdNextState2 + (EndStateAtom-StartStateAtom) * i+(i-n+1);
  1033. if FStateTable[FStateCount].sdMatchType = mtClass then
  1034. FStateTable[FStateCount].sdMatchType := mtDupClass;
  1035. inc(FStateCount);
  1036. if FStateCount=length(FStateTable) then
  1037. setlength(FStateTable,(FStateCount * 3) div 2);
  1038. end;
  1039. end;
  1040. if m = -1 then
  1041. rcAddState(mtNone, #0, nil, NewFinalState, TempEndStateAtom);
  1042. Result := StartStateAtom;
  1043. end;
  1044. {always handle expressions with braces as regular-expression}
  1045. FRegexType := rtRegEx;
  1046. end;
  1047. else
  1048. Result := StartStateAtom;
  1049. end;{case}
  1050. end;
  1051. {--------}
  1052. function TRegexEngine.rcParseTerm : integer;
  1053. var
  1054. StartState2 : integer;
  1055. EndState1 : integer;
  1056. begin
  1057. {parse an initial factor, the state number returned will also be our
  1058. return state number}
  1059. Result := rcParseFactor;
  1060. if (Result = ErrorState) then
  1061. Exit;
  1062. {Note: we have to "break the grammar" here. We've parsed a regular
  1063. subexpression and we're possibly following on with another
  1064. regular subexpression. There's no nice operator to key off
  1065. for concatenation: we just have to know that for
  1066. concatenating two subexpressions, the current character will
  1067. be
  1068. - an open parenthesis
  1069. - an open square bracket
  1070. - an any AnsiChar operator
  1071. - a character that's not a metacharacter
  1072. i.e., the three possibilities for the start of an "atom" in
  1073. our grammar}
  1074. if (FPosn^ = '(') or
  1075. (FPosn^ = '[') or
  1076. (FPosn^ = '.') or
  1077. ((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
  1078. {the initial factor's end state does not exist yet (although there
  1079. is a state in the term that points to it), so create it}
  1080. EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
  1081. {parse another term}
  1082. StartState2 := rcParseTerm;
  1083. if (StartState2 = ErrorState) then begin
  1084. Result := ErrorState;
  1085. Exit;
  1086. end;
  1087. {join the first factor to the second term}
  1088. rcSetState(EndState1, StartState2, UnusedState);
  1089. end;
  1090. end;
  1091. procedure TRegexEngine.WriteTable;
  1092. var i : integer;
  1093. begin
  1094. for i := 0 to FStateCount-1 do with FStateTable[i] do
  1095. writeln('s:',i,' mt:',sdMatchType ,' ns1:',sdNextState1,' ns2:',sdNextState2,' AnsiChar:',sdChar);
  1096. end;
  1097. procedure TRegexEngine.DequeEnqueue(aValue: integer);
  1098. begin
  1099. FList[FTail] := aValue;
  1100. inc(FTail);
  1101. if (FTail = FCapacity) then
  1102. FTail := 0
  1103. else if (FTail = FHead) then
  1104. DequeGrow;
  1105. end;
  1106. procedure TRegexEngine.DequePush(aValue: integer);
  1107. begin
  1108. if (FHead = 0) then
  1109. FHead := FCapacity;
  1110. dec(FHead);
  1111. FList[FHead] := aValue;
  1112. if (FTail = FHead) then
  1113. DequeGrow;
  1114. end;
  1115. function TRegexEngine.DequePop: integer;
  1116. begin
  1117. Result := FList[FHead];
  1118. inc(FHead);
  1119. if (FHead = FCapacity) then
  1120. FHead := 0;
  1121. end;
  1122. procedure TRegexEngine.DequeGrow;
  1123. var
  1124. OldCount : integer;
  1125. i, j : integer;
  1126. begin
  1127. {grow the list by 50%}
  1128. OldCount := FCapacity;
  1129. FCapacity:=(OldCount * 3) div 2;
  1130. SetLength(FList,FCapacity);
  1131. {expand the data into the increased space, maintaining the deque}
  1132. if (FHead = 0) then
  1133. FTail := OldCount
  1134. else begin
  1135. j := FCapacity;
  1136. for i := pred(OldCount) downto FHead do begin
  1137. dec(j);
  1138. FList[j] := FList[i]
  1139. end;
  1140. FHead := j;
  1141. end;
  1142. end;
  1143. function TRegexEngine.rcReturnEscapeChar: AnsiChar;
  1144. begin
  1145. case FPosn^ of
  1146. 't' : Result := #9;
  1147. 'n' : Result := #10;
  1148. 'r' : Result := #13;
  1149. 'f' : Result := #12;
  1150. 'a' : Result := #7;
  1151. else
  1152. Result := FPosn^;
  1153. end;
  1154. end;
  1155. {--------}
  1156. procedure TRegexEngine.rcSetIgnoreCase(aValue : boolean);
  1157. begin
  1158. if (aValue <> FIgnoreCase) then begin
  1159. rcClear;
  1160. FIgnoreCase := aValue;
  1161. end;
  1162. end;
  1163. {--------}
  1164. procedure TRegexEngine.rcSetRegexStr(const aRegexStr : AnsiString);
  1165. begin
  1166. if (aRegexStr <> FRegexStr) then begin
  1167. rcClear;
  1168. FRegexStr := aRegexStr;
  1169. end;
  1170. end;
  1171. {--------}
  1172. function TRegexEngine.rcSetState(aState : integer;
  1173. aNextState1: integer;
  1174. aNextState2: integer) : integer;
  1175. begin
  1176. Assert((0 <= aState) and (aState < FStateCount),
  1177. 'trying to change an invalid state');
  1178. {get the state record and change the transition information}
  1179. FStateTable[aState].sdNextState1 := aNextState1;
  1180. FStateTable[aState].sdNextState2 := aNextState2;
  1181. Result := aState;
  1182. end;
  1183. {--------}
  1184. procedure TRegexEngine.rcSetUpcase(aValue : TUpcaseFunc);
  1185. begin
  1186. if not Assigned(aValue) then
  1187. FUpcase := SystemUpcase
  1188. else
  1189. FUpcase := aValue;
  1190. end;
  1191. procedure TRegexEngine.rcSetMultiLine(aValue: Boolean);
  1192. begin
  1193. FMultiLine:=aValue;
  1194. end;
  1195. {====================================================================}
  1196. end.