PerlRegEx.pas 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963
  1. {**************************************************************************************************}
  2. { }
  3. { Perl Regular Expressions VCL component }
  4. { }
  5. { The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
  6. { you may not use this file except in compliance with the License. You may obtain a copy of the }
  7. { License at http://www.mozilla.org/MPL/ }
  8. { }
  9. { Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
  10. { ANY KIND, either express or implied. See the License for the specific language governing rights }
  11. { and limitations under the License. }
  12. { }
  13. { The Original Code is PerlRegEx.pas. }
  14. { }
  15. { The Initial Developer of the Original Code is Jan Goyvaerts. }
  16. { Portions created by Jan Goyvaerts are Copyright (C) 1999, 2005, 2008, 2010 Jan Goyvaerts. }
  17. { All rights reserved. }
  18. { }
  19. { Design & implementation, by Jan Goyvaerts, 1999, 2005, 2008, 2010 }
  20. { }
  21. { TPerlRegEx is available at http://www.regular-expressions.info/delphi.html }
  22. { }
  23. {**************************************************************************************************}
  24. unit PerlRegEx;
  25. interface
  26. uses
  27. Windows, Messages, SysUtils, Classes,
  28. pcre;
  29. type
  30. TPerlRegExOptions = set of (
  31. preCaseLess, // /i -> Case insensitive
  32. preMultiLine, // /m -> ^ and $ also match before/after a newline, not just at the beginning and the end of the string
  33. preSingleLine, // /s -> Dot matches any character, including \n (newline). Otherwise, it matches anything except \n
  34. preExtended, // /x -> Allow regex to contain extra whitespace, newlines and Perl-style comments, all of which will be filtered out
  35. preAnchored, // /A -> Successful match can only occur at the start of the subject or right after the previous match
  36. preUnGreedy, // Repeat operators (+, *, ?) are not greedy by default (i.e. they try to match the minimum number of characters instead of the maximum)
  37. preNoAutoCapture // (group) is a non-capturing group; only named groups capture
  38. );
  39. type
  40. TPerlRegExState = set of (
  41. preNotBOL, // Not Beginning Of Line: ^ does not match at the start of Subject
  42. preNotEOL, // Not End Of Line: $ does not match at the end of Subject
  43. preNotEmpty // Empty matches not allowed
  44. );
  45. const
  46. // Maximum number of subexpressions (backreferences)
  47. // Subexpressions are created by placing round brackets in the regex, and are referenced by \1, \2, ...
  48. // In Perl, they are available as $1, $2, ... after the regex matched; with TPerlRegEx, use the Subexpressions property
  49. // You can also insert \1, \2, ... in the replacement string; \0 is the complete matched expression
  50. MAX_SUBEXPRESSIONS = 99;
  51. {$IFDEF UNICODE}
  52. // All implicit string casts have been verified to be correct
  53. {$WARN IMPLICIT_STRING_CAST OFF}
  54. // Use UTF-8 in Delphi 2009 and later, so Unicode strings are handled correctly.
  55. // PCRE does not support UTF-16
  56. type
  57. PCREString = UTF8String;
  58. {$ELSE UNICODE}
  59. // Use AnsiString in Delphi 2007 and earlier
  60. type
  61. PCREString = AnsiString;
  62. {$ENDIF UNICODE}
  63. type
  64. TPerlRegExReplaceEvent = procedure(Sender: TObject; var ReplaceWith: PCREString) of object;
  65. type
  66. TPerlRegEx = class
  67. private // *** Property storage, getters and setters
  68. FCompiled, FStudied: Boolean;
  69. FOptions: TPerlRegExOptions;
  70. FState: TPerlRegExState;
  71. FRegEx, FReplacement, FSubject: PCREString;
  72. FStart, FStop: Integer;
  73. FOnMatch: TNotifyEvent;
  74. FOnReplace: TPerlRegExReplaceEvent;
  75. function GetMatchedText: PCREString;
  76. function GetMatchedLength: Integer;
  77. function GetMatchedOffset: Integer;
  78. procedure SetOptions(Value: TPerlRegExOptions);
  79. procedure SetRegEx(const Value: PCREString);
  80. function GetGroupCount: Integer;
  81. function GetGroups(Index: Integer): PCREString;
  82. function GetGroupLengths(Index: Integer): Integer;
  83. function GetGroupOffsets(Index: Integer): Integer;
  84. procedure SetSubject(const Value: PCREString);
  85. procedure SetStart(const Value: Integer);
  86. procedure SetStop(const Value: Integer);
  87. function GetFoundMatch: Boolean;
  88. private // *** Variables used by PCRE
  89. Offsets: array[0..(MAX_SUBEXPRESSIONS+1)*3] of Integer;
  90. OffsetCount: Integer;
  91. pcreOptions: Integer;
  92. pattern, hints, chartable: Pointer;
  93. FSubjectPChar: PAnsiChar;
  94. FHasStoredGroups: Boolean;
  95. FStoredGroups: array of PCREString;
  96. function GetSubjectLeft: PCREString;
  97. function GetSubjectRight: PCREString;
  98. protected
  99. procedure CleanUp;
  100. // Dispose off whatever we created, so we can start over. Called automatically when needed, so it is not made public
  101. procedure ClearStoredGroups;
  102. public
  103. constructor Create;
  104. // Come to life
  105. destructor Destroy; override;
  106. // Clean up after ourselves
  107. class function EscapeRegExChars(const S: string): string;
  108. // Escapes regex characters in S so that the regex engine can be used to match S as plain text
  109. procedure Compile;
  110. // Compile the regex. Called automatically by Match
  111. procedure Study;
  112. // Study the regex. Studying takes time, but will make the execution of the regex a lot faster.
  113. // Call study if you will be using the same regex many times
  114. function Match: Boolean;
  115. // Attempt to match the regex, starting the attempt from the beginning of Subject
  116. function MatchAgain: Boolean;
  117. // Attempt to match the regex to the remainder of Subject after the previous match (as indicated by Start)
  118. function Replace: PCREString;
  119. // Replace matched expression in Subject with ComputeReplacement. Returns the actual replacement text from ComputeReplacement
  120. function ReplaceAll: Boolean;
  121. // Repeat MatchAgain and Replace until you drop. Returns True if anything was replaced at all.
  122. function ComputeReplacement: PCREString;
  123. // Returns Replacement with backreferences filled in
  124. procedure StoreGroups;
  125. // Stores duplicates of Groups[] so they and ComputeReplacement will still return the proper strings
  126. // even if FSubject is changed or cleared
  127. function NamedGroup(const Name: PCREString): Integer;
  128. // Returns the index of the named group Name
  129. procedure Split(Strings: TStrings; Limit: Integer);
  130. // Split Subject along regex matches. Capturing groups are ignored.
  131. procedure SplitCapture(Strings: TStrings; Limit: Integer); overload;
  132. procedure SplitCapture(Strings: TStrings; Limit: Integer; Offset: Integer); overload;
  133. // Split Subject along regex matches. Capturing groups are added to Strings as well.
  134. property Compiled: Boolean read FCompiled;
  135. // True if the RegEx has already been compiled.
  136. property FoundMatch: Boolean read GetFoundMatch;
  137. // Returns True when Matched* and Group* indicate a match
  138. property Studied: Boolean read FStudied;
  139. // True if the RegEx has already been studied
  140. property MatchedText: PCREString read GetMatchedText;
  141. // The matched text
  142. property MatchedLength: Integer read GetMatchedLength;
  143. // Length of the matched text
  144. property MatchedOffset: Integer read GetMatchedOffset;
  145. // Character offset in the Subject string at which MatchedText starts
  146. property Start: Integer read FStart write SetStart;
  147. // Starting position in Subject from which MatchAgain begins
  148. property Stop: Integer read FStop write SetStop;
  149. // Last character in Subject that Match and MatchAgain search through
  150. property State: TPerlRegExState read FState write FState;
  151. // State of Subject
  152. property GroupCount: Integer read GetGroupCount;
  153. // Number of matched capturing groups
  154. property Groups[Index: Integer]: PCREString read GetGroups;
  155. // Text matched by capturing groups
  156. property GroupLengths[Index: Integer]: Integer read GetGroupLengths;
  157. // Lengths of the text matched by capturing groups
  158. property GroupOffsets[Index: Integer]: Integer read GetGroupOffsets;
  159. // Character offsets in Subject at which the capturing group matches start
  160. property Subject: PCREString read FSubject write SetSubject;
  161. // The string on which Match() will try to match RegEx
  162. property SubjectLeft: PCREString read GetSubjectLeft;
  163. // Part of the subject to the left of the match
  164. property SubjectRight: PCREString read GetSubjectRight;
  165. // Part of the subject to the right of the match
  166. public
  167. property Options: TPerlRegExOptions read FOptions write SetOptions;
  168. // Options
  169. property RegEx: PCREString read FRegEx write SetRegEx;
  170. // The regular expression to be matched
  171. property Replacement: PCREString read FReplacement write FReplacement;
  172. // Text to replace matched expression with. \number and $number backreferences will be substituted with Groups
  173. // TPerlRegEx supports the "JGsoft" replacement text flavor as explained at http://www.regular-expressions.info/refreplace.html
  174. property OnMatch: TNotifyEvent read FOnMatch write FOnMatch;
  175. // Triggered by Match and MatchAgain after a successful match
  176. property OnReplace: TPerlRegExReplaceEvent read FOnReplace write FOnReplace;
  177. // Triggered by Replace and ReplaceAll just before the replacement is done, allowing you to determine the new PCREString
  178. end;
  179. {
  180. You can add TPerlRegEx instances to a TPerlRegExList to match them all together on the same subject,
  181. as if they were one regex regex1|regex2|regex3|...
  182. TPerlRegExList does not own the TPerlRegEx components, just like a TList
  183. If a TPerlRegEx has been added to a TPerlRegExList, it should not be used in any other situation
  184. until it is removed from the list
  185. }
  186. type
  187. TPerlRegExList = class
  188. private
  189. FList: TList;
  190. FSubject: PCREString;
  191. FMatchedRegEx: TPerlRegEx;
  192. FStart, FStop: Integer;
  193. function GetRegEx(Index: Integer): TPerlRegEx;
  194. procedure SetRegEx(Index: Integer; Value: TPerlRegEx);
  195. procedure SetSubject(const Value: PCREString);
  196. procedure SetStart(const Value: Integer);
  197. procedure SetStop(const Value: Integer);
  198. function GetCount: Integer;
  199. protected
  200. procedure UpdateRegEx(ARegEx: TPerlRegEx);
  201. public
  202. constructor Create;
  203. destructor Destroy; override;
  204. public
  205. function Add(ARegEx: TPerlRegEx): Integer;
  206. procedure Clear;
  207. procedure Delete(Index: Integer);
  208. function IndexOf(ARegEx: TPerlRegEx): Integer;
  209. procedure Insert(Index: Integer; ARegEx: TPerlRegEx);
  210. public
  211. function Match: Boolean;
  212. function MatchAgain: Boolean;
  213. property RegEx[Index: Integer]: TPerlRegEx read GetRegEx write SetRegEx;
  214. property Count: Integer read GetCount;
  215. property Subject: PCREString read FSubject write SetSubject;
  216. property Start: Integer read FStart write SetStart;
  217. property Stop: Integer read FStop write SetStop;
  218. property MatchedRegEx: TPerlRegEx read FMatchedRegEx;
  219. end;
  220. implementation
  221. { ********* Unit support routines ********* }
  222. function FirstCap(const S: string): string;
  223. begin
  224. if S = '' then Result := ''
  225. else begin
  226. Result := AnsiLowerCase(S);
  227. {$IFDEF UNICODE}
  228. CharUpperBuffW(@Result[1], 1);
  229. {$ELSE}
  230. CharUpperBuffA(@Result[1], 1);
  231. {$ENDIF}
  232. end
  233. end;
  234. function InitialCaps(const S: string): string;
  235. var
  236. I: Integer;
  237. Up: Boolean;
  238. begin
  239. Result := AnsiLowerCase(S);
  240. Up := True;
  241. {$IFDEF UNICODE}
  242. for I := 1 to Length(Result) do begin
  243. case Result[I] of
  244. #0..'&', '(', '*', '+', ',', '-', '.', '?', '<', '[', '{', #$00B7:
  245. Up := True
  246. else
  247. if Up and (Result[I] <> '''') then begin
  248. CharUpperBuffW(@Result[I], 1);
  249. Up := False
  250. end
  251. end;
  252. end;
  253. {$ELSE UNICODE}
  254. if SysLocale.FarEast then begin
  255. I := 1;
  256. while I <= Length(Result) do begin
  257. if Result[I] in LeadBytes then begin
  258. Inc(I, 2)
  259. end
  260. else begin
  261. if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{'] then Up := True
  262. else if Up and (Result[I] <> '''') then begin
  263. CharUpperBuffA(@Result[I], 1);
  264. Result[I] := UpperCase(Result[I])[1];
  265. Up := False
  266. end;
  267. Inc(I)
  268. end
  269. end
  270. end
  271. else
  272. for I := 1 to Length(Result) do begin
  273. if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{', #$B7] then Up := True
  274. else if Up and (Result[I] <> '''') then begin
  275. CharUpperBuffA(@Result[I], 1);
  276. Result[I] := AnsiUpperCase(Result[I])[1];
  277. Up := False
  278. end
  279. end;
  280. {$ENDIF UNICODE}
  281. end;
  282. { ********* TPerlRegEx component ********* }
  283. procedure TPerlRegEx.CleanUp;
  284. begin
  285. FCompiled := False; FStudied := False;
  286. pcre_dispose(pattern, hints, nil);
  287. pattern := nil;
  288. hints := nil;
  289. ClearStoredGroups;
  290. OffsetCount := 0;
  291. end;
  292. procedure TPerlRegEx.ClearStoredGroups;
  293. begin
  294. FHasStoredGroups := False;
  295. FStoredGroups := nil;
  296. end;
  297. procedure TPerlRegEx.Compile;
  298. var
  299. Error: PAnsiChar;
  300. ErrorOffset: Integer;
  301. begin
  302. if FRegEx = '' then
  303. raise Exception.Create('TPerlRegEx.Compile() - Please specify a regular expression in RegEx first');
  304. CleanUp;
  305. Pattern := pcre_compile(PAnsiChar(FRegEx), pcreOptions, @Error, @ErrorOffset, chartable);
  306. if Pattern = nil then
  307. raise Exception.Create(Format('TPerlRegEx.Compile() - Error in regex at offset %d: %s', [ErrorOffset, AnsiString(Error)]));
  308. FCompiled := True
  309. end;
  310. (* Backreference overview:
  311. Assume there are 13 backreferences:
  312. Text TPerlRegex .NET Java ECMAScript
  313. $17 $1 + "7" "$17" $1 + "7" $1 + "7"
  314. $017 $1 + "7" "$017" $1 + "7" $1 + "7"
  315. $12 $12 $12 $12 $12
  316. $012 $1 + "2" $12 $12 $1 + "2"
  317. ${1}2 $1 + "2" $1 + "2" error "${1}2"
  318. $$ "$" "$" error "$"
  319. \$ "$" "\$" "$" "\$"
  320. *)
  321. function TPerlRegEx.ComputeReplacement: PCREString;
  322. var
  323. Mode: AnsiChar;
  324. S: PCREString;
  325. I, J, N: Integer;
  326. procedure ReplaceBackreference(Number: Integer);
  327. var
  328. Backreference: PCREString;
  329. begin
  330. Delete(S, I, J-I);
  331. if Number <= GroupCount then begin
  332. Backreference := Groups[Number];
  333. if Backreference <> '' then begin
  334. // Ignore warnings; converting to UTF-8 does not cause data loss
  335. case Mode of
  336. 'L', 'l': Backreference := AnsiLowerCase(Backreference);
  337. 'U', 'u': Backreference := AnsiUpperCase(Backreference);
  338. 'F', 'f': Backreference := FirstCap(Backreference);
  339. 'I', 'i': Backreference := InitialCaps(Backreference);
  340. end;
  341. if S <> '' then begin
  342. Insert(Backreference, S, I);
  343. I := I + Length(Backreference);
  344. end
  345. else begin
  346. S := Backreference;
  347. I := MaxInt;
  348. end
  349. end;
  350. end
  351. end;
  352. procedure ProcessBackreference(NumberOnly, Dollar: Boolean);
  353. var
  354. Number, Number2: Integer;
  355. Group: PCREString;
  356. begin
  357. Number := -1;
  358. if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
  359. // Get the number of the backreference
  360. Number := Ord(S[J]) - Ord('0');
  361. Inc(J);
  362. if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
  363. // Expand it to two digits only if that would lead to a valid backreference
  364. Number2 := Number*10 + Ord(S[J]) - Ord('0');
  365. if Number2 <= GroupCount then begin
  366. Number := Number2;
  367. Inc(J)
  368. end;
  369. end;
  370. end
  371. else if not NumberOnly then begin
  372. if Dollar and (J < Length(S)) and (S[J] = '{') then begin
  373. // Number or name in curly braces
  374. Inc(J);
  375. case S[J] of
  376. '0'..'9': begin
  377. Number := Ord(S[J]) - Ord('0');
  378. Inc(J);
  379. while (J <= Length(S)) and (S[J] in ['0'..'9']) do begin
  380. Number := Number*10 + Ord(S[J]) - Ord('0');
  381. Inc(J)
  382. end;
  383. end;
  384. 'A'..'Z', 'a'..'z', '_': begin
  385. Inc(J);
  386. while (J <= Length(S)) and (S[J] in ['A'..'Z', 'a'..'z', '0'..'9', '_']) do Inc(J);
  387. if (J <= Length(S)) and (S[J] = '}') then begin
  388. Group := Copy(S, I+2, J-I-2);
  389. Number := NamedGroup(Group);
  390. end
  391. end;
  392. end;
  393. if (J > Length(S)) or (S[J] <> '}') then Number := -1
  394. else Inc(J)
  395. end
  396. else if Dollar and (S[J] = '_') then begin
  397. // $_ (whole subject)
  398. Delete(S, I, J+1-I);
  399. Insert(Subject, S, I);
  400. I := I + Length(Subject);
  401. Exit;
  402. end
  403. else case S[J] of
  404. '&': begin
  405. // \& or $& (whole regex match)
  406. Number := 0;
  407. Inc(J);
  408. end;
  409. '+': begin
  410. // \+ or $+ (highest-numbered participating group)
  411. Number := GroupCount;
  412. Inc(J);
  413. end;
  414. '`': begin
  415. // \` or $` (backtick; subject to the left of the match)
  416. Delete(S, I, J+1-I);
  417. Insert(SubjectLeft, S, I);
  418. I := I + Offsets[0] - 1;
  419. Exit;
  420. end;
  421. '''': begin
  422. // \' or $' (straight quote; subject to the right of the match)
  423. Delete(S, I, J+1-I);
  424. Insert(SubjectRight, S, I);
  425. I := I + Length(Subject) - Offsets[1];
  426. Exit;
  427. end
  428. end;
  429. end;
  430. if Number >= 0 then ReplaceBackreference(Number)
  431. else Inc(I)
  432. end;
  433. begin
  434. S := FReplacement;
  435. I := 1;
  436. while I < Length(S) do begin
  437. case S[I] of
  438. '\': begin
  439. J := I + 1;
  440. Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
  441. case S[J] of
  442. '$', '\': begin
  443. Delete(S, I, 1);
  444. Inc(I);
  445. end;
  446. 'g': begin
  447. if (J < Length(S)-1) and (S[J+1] = '<') and (S[J+2] in ['A'..'Z', 'a'..'z', '_']) then begin
  448. // Python-style named group reference \g<name>
  449. J := J+3;
  450. while (J <= Length(S)) and (S[J] in ['0'..'9', 'A'..'Z', 'a'..'z', '_']) do Inc(J);
  451. if (J <= Length(S)) and (S[J] = '>') then begin
  452. N := NamedGroup(Copy(S, I+3, J-I-3));
  453. Inc(J);
  454. Mode := #0;
  455. if N > 0 then ReplaceBackreference(N)
  456. else Delete(S, I, J-I)
  457. end
  458. else I := J
  459. end
  460. else I := I+2;
  461. end;
  462. 'l', 'L', 'u', 'U', 'f', 'F', 'i', 'I': begin
  463. Mode := S[J];
  464. Inc(J);
  465. ProcessBackreference(True, False);
  466. end;
  467. else begin
  468. Mode := #0;
  469. ProcessBackreference(False, False);
  470. end;
  471. end;
  472. end;
  473. '$': begin
  474. J := I + 1;
  475. Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
  476. if S[J] = '$' then begin
  477. Delete(S, J, 1);
  478. Inc(I);
  479. end
  480. else begin
  481. Mode := #0;
  482. ProcessBackreference(False, True);
  483. end
  484. end;
  485. else Inc(I)
  486. end
  487. end;
  488. Result := S
  489. end;
  490. constructor TPerlRegEx.Create;
  491. begin
  492. inherited Create;
  493. FState := [preNotEmpty];
  494. chartable := pcre_maketables;
  495. {$IFDEF UNICODE}
  496. pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
  497. {$ELSE}
  498. pcreOptions := PCRE_NEWLINE_ANY;
  499. {$ENDIF}
  500. end;
  501. destructor TPerlRegEx.Destroy;
  502. begin
  503. pcre_dispose(pattern, hints, chartable);
  504. inherited Destroy;
  505. end;
  506. class function TPerlRegEx.EscapeRegExChars(const S: string): string;
  507. var
  508. I: Integer;
  509. begin
  510. Result := S;
  511. I := Length(Result);
  512. while I > 0 do begin
  513. case Result[I] of
  514. '.', '[', ']', '(', ')', '?', '*', '+', '{', '}', '^', '$', '|', '\':
  515. Insert('\', Result, I);
  516. #0: begin
  517. Result[I] := '0';
  518. Insert('\', Result, I);
  519. end;
  520. end;
  521. Dec(I);
  522. end;
  523. end;
  524. function TPerlRegEx.GetFoundMatch: Boolean;
  525. begin
  526. Result := OffsetCount > 0;
  527. end;
  528. function TPerlRegEx.GetMatchedText: PCREString;
  529. begin
  530. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  531. Result := GetGroups(0);
  532. end;
  533. function TPerlRegEx.GetMatchedLength: Integer;
  534. begin
  535. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  536. Result := GetGroupLengths(0)
  537. end;
  538. function TPerlRegEx.GetMatchedOffset: Integer;
  539. begin
  540. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  541. Result := GetGroupOffsets(0)
  542. end;
  543. function TPerlRegEx.GetGroupCount: Integer;
  544. begin
  545. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  546. Result := OffsetCount-1
  547. end;
  548. function TPerlRegEx.GetGroupLengths(Index: Integer): Integer;
  549. begin
  550. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  551. Assert((Index >= 0) and (Index <= GroupCount), 'REQUIRE: Index <= GroupCount');
  552. Result := Offsets[Index*2+1]-Offsets[Index*2]
  553. end;
  554. function TPerlRegEx.GetGroupOffsets(Index: Integer): Integer;
  555. begin
  556. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  557. Assert((Index >= 0) and (Index <= GroupCount), 'REQUIRE: Index <= GroupCount');
  558. Result := Offsets[Index*2]
  559. end;
  560. function TPerlRegEx.GetGroups(Index: Integer): PCREString;
  561. begin
  562. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  563. if Index > GroupCount then Result := ''
  564. else if FHasStoredGroups then Result := FStoredGroups[Index]
  565. else Result := Copy(FSubject, Offsets[Index*2], Offsets[Index*2+1]-Offsets[Index*2]);
  566. end;
  567. function TPerlRegEx.GetSubjectLeft: PCREString;
  568. begin
  569. Result := Copy(Subject, 1, Offsets[0]-1);
  570. end;
  571. function TPerlRegEx.GetSubjectRight: PCREString;
  572. begin
  573. Result := Copy(Subject, Offsets[1], MaxInt);
  574. end;
  575. function TPerlRegEx.Match: Boolean;
  576. var
  577. I, Opts: Integer;
  578. begin
  579. ClearStoredGroups;
  580. if not Compiled then Compile;
  581. if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
  582. if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
  583. if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
  584. OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, 0, Opts, @Offsets[0], High(Offsets));
  585. Result := OffsetCount > 0;
  586. // Convert offsets into PCREString indices
  587. if Result then begin
  588. for I := 0 to OffsetCount*2-1 do
  589. Inc(Offsets[I]);
  590. FStart := Offsets[1];
  591. if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
  592. if Assigned(OnMatch) then OnMatch(Self)
  593. end;
  594. end;
  595. function TPerlRegEx.MatchAgain: Boolean;
  596. var
  597. I, Opts: Integer;
  598. begin
  599. ClearStoredGroups;
  600. if not Compiled then Compile;
  601. if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
  602. if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
  603. if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
  604. if FStart-1 > FStop then OffsetCount := -1
  605. else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, FStart-1, Opts, @Offsets[0], High(Offsets));
  606. Result := OffsetCount > 0;
  607. // Convert offsets into PCREString indices
  608. if Result then begin
  609. for I := 0 to OffsetCount*2-1 do
  610. Inc(Offsets[I]);
  611. FStart := Offsets[1];
  612. if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
  613. if Assigned(OnMatch) then OnMatch(Self)
  614. end;
  615. end;
  616. function TPerlRegEx.NamedGroup(const Name: PCREString): Integer;
  617. begin
  618. Result := pcre_get_stringnumber(Pattern, PAnsiChar(Name));
  619. end;
  620. function TPerlRegEx.Replace: PCREString;
  621. begin
  622. Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
  623. // Substitute backreferences
  624. Result := ComputeReplacement;
  625. // Allow for just-in-time substitution determination
  626. if Assigned(OnReplace) then OnReplace(Self, Result);
  627. // Perform substitution
  628. Delete(FSubject, MatchedOffset, MatchedLength);
  629. if Result <> '' then Insert(Result, FSubject, MatchedOffset);
  630. FSubjectPChar := PAnsiChar(FSubject);
  631. // Position to continue search
  632. FStart := FStart - MatchedLength + Length(Result);
  633. FStop := FStop - MatchedLength + Length(Result);
  634. // Replacement no longer matches regex, we assume
  635. ClearStoredGroups;
  636. OffsetCount := 0;
  637. end;
  638. function TPerlRegEx.ReplaceAll: Boolean;
  639. begin
  640. if Match then begin
  641. Result := True;
  642. repeat
  643. Replace
  644. until not MatchAgain;
  645. end
  646. else Result := False;
  647. end;
  648. procedure TPerlRegEx.SetOptions(Value: TPerlRegExOptions);
  649. begin
  650. if (FOptions <> Value) then begin
  651. FOptions := Value;
  652. {$IFDEF UNICODE}
  653. pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
  654. {$ELSE}
  655. pcreOptions := PCRE_NEWLINE_ANY;
  656. {$ENDIF}
  657. if (preCaseLess in Value) then pcreOptions := pcreOptions or PCRE_CASELESS;
  658. if (preMultiLine in Value) then pcreOptions := pcreOptions or PCRE_MULTILINE;
  659. if (preSingleLine in Value) then pcreOptions := pcreOptions or PCRE_DOTALL;
  660. if (preExtended in Value) then pcreOptions := pcreOptions or PCRE_EXTENDED;
  661. if (preAnchored in Value) then pcreOptions := pcreOptions or PCRE_ANCHORED;
  662. if (preUnGreedy in Value) then pcreOptions := pcreOptions or PCRE_UNGREEDY;
  663. if (preNoAutoCapture in Value) then pcreOptions := pcreOptions or PCRE_NO_AUTO_CAPTURE;
  664. CleanUp
  665. end
  666. end;
  667. procedure TPerlRegEx.SetRegEx(const Value: PCREString);
  668. begin
  669. if FRegEx <> Value then begin
  670. FRegEx := Value;
  671. CleanUp
  672. end
  673. end;
  674. procedure TPerlRegEx.SetStart(const Value: Integer);
  675. begin
  676. if Value < 1 then FStart := 1
  677. else FStart := Value;
  678. // If FStart > Length(Subject), MatchAgain() will simply return False
  679. end;
  680. procedure TPerlRegEx.SetStop(const Value: Integer);
  681. begin
  682. if Value > Length(Subject) then FStop := Length(Subject)
  683. else FStop := Value;
  684. end;
  685. procedure TPerlRegEx.SetSubject(const Value: PCREString);
  686. begin
  687. FSubject := Value;
  688. FSubjectPChar := PAnsiChar(Value);
  689. FStart := 1;
  690. FStop := Length(Subject);
  691. if not FHasStoredGroups then OffsetCount := 0;
  692. end;
  693. procedure TPerlRegEx.Split(Strings: TStrings; Limit: Integer);
  694. var
  695. Offset, Count: Integer;
  696. begin
  697. Assert(Strings <> nil, 'REQUIRE: Strings');
  698. if (Limit = 1) or not Match then Strings.Add(Subject)
  699. else begin
  700. Offset := 1;
  701. Count := 1;
  702. repeat
  703. Strings.Add(Copy(Subject, Offset, MatchedOffset - Offset));
  704. Inc(Count);
  705. Offset := MatchedOffset + MatchedLength;
  706. until ((Limit > 1) and (Count >= Limit)) or not MatchAgain;
  707. Strings.Add(Copy(Subject, Offset, MaxInt));
  708. end
  709. end;
  710. procedure TPerlRegEx.SplitCapture(Strings: TStrings; Limit, Offset: Integer);
  711. var
  712. Count: Integer;
  713. bUseOffset : boolean;
  714. iOffset : integer;
  715. begin
  716. Assert(Strings <> nil, 'REQUIRE: Strings');
  717. if (Limit = 1) or not Match then Strings.Add(Subject)
  718. else
  719. begin
  720. bUseOffset := Offset <> 1;
  721. if Offset <> 1 then
  722. Dec(Limit);
  723. iOffset := 1;
  724. Count := 1;
  725. repeat
  726. if bUseOffset then
  727. begin
  728. if MatchedOffset >= Offset then
  729. begin
  730. bUseOffset := False;
  731. Strings.Add(Copy(Subject, 1, MatchedOffset -1));
  732. if Self.GroupCount > 0 then
  733. Strings.Add(Self.Groups[Self.GroupCount]);
  734. end;
  735. end
  736. else
  737. begin
  738. Strings.Add(Copy(Subject, iOffset, MatchedOffset - iOffset));
  739. Inc(Count);
  740. if Self.GroupCount > 0 then
  741. Strings.Add(Self.Groups[Self.GroupCount]);
  742. end;
  743. iOffset := MatchedOffset + MatchedLength;
  744. until ((Limit > 1) and (Count >= Limit)) or not MatchAgain;
  745. Strings.Add(Copy(Subject, iOffset, MaxInt));
  746. end
  747. end;
  748. procedure TPerlRegEx.SplitCapture(Strings: TStrings; Limit: Integer);
  749. begin
  750. SplitCapture(Strings,Limit,1);
  751. end;
  752. procedure TPerlRegEx.StoreGroups;
  753. var
  754. I: Integer;
  755. begin
  756. if OffsetCount > 0 then begin
  757. ClearStoredGroups;
  758. SetLength(FStoredGroups, GroupCount+1);
  759. for I := GroupCount downto 0 do
  760. FStoredGroups[I] := Groups[I];
  761. FHasStoredGroups := True;
  762. end
  763. end;
  764. procedure TPerlRegEx.Study;
  765. var
  766. Error: PAnsiChar;
  767. begin
  768. if not FCompiled then Compile;
  769. Hints := pcre_study(Pattern, 0, @Error);
  770. if Error <> nil then
  771. raise Exception.Create('TPerlRegEx.Study() - Error studying the regex: ' + AnsiString(Error));
  772. FStudied := True
  773. end;
  774. { TPerlRegExList }
  775. function TPerlRegExList.Add(ARegEx: TPerlRegEx): Integer;
  776. begin
  777. Result := FList.Add(ARegEx);
  778. UpdateRegEx(ARegEx);
  779. end;
  780. procedure TPerlRegExList.Clear;
  781. begin
  782. FList.Clear;
  783. end;
  784. constructor TPerlRegExList.Create;
  785. begin
  786. inherited Create;
  787. FList := TList.Create;
  788. end;
  789. procedure TPerlRegExList.Delete(Index: Integer);
  790. begin
  791. FList.Delete(Index);
  792. end;
  793. destructor TPerlRegExList.Destroy;
  794. begin
  795. FList.Free;
  796. inherited
  797. end;
  798. function TPerlRegExList.GetCount: Integer;
  799. begin
  800. Result := FList.Count;
  801. end;
  802. function TPerlRegExList.GetRegEx(Index: Integer): TPerlRegEx;
  803. begin
  804. Result := TPerlRegEx(Pointer(FList[Index]));
  805. end;
  806. function TPerlRegExList.IndexOf(ARegEx: TPerlRegEx): Integer;
  807. begin
  808. Result := FList.IndexOf(ARegEx);
  809. end;
  810. procedure TPerlRegExList.Insert(Index: Integer; ARegEx: TPerlRegEx);
  811. begin
  812. FList.Insert(Index, ARegEx);
  813. UpdateRegEx(ARegEx);
  814. end;
  815. function TPerlRegExList.Match: Boolean;
  816. begin
  817. SetStart(1);
  818. FMatchedRegEx := nil;
  819. Result := MatchAgain;
  820. end;
  821. function TPerlRegExList.MatchAgain: Boolean;
  822. var
  823. I, MatchStart, MatchPos: Integer;
  824. ARegEx: TPerlRegEx;
  825. begin
  826. if FMatchedRegEx <> nil then
  827. MatchStart := FMatchedRegEx.MatchedOffset + FMatchedRegEx.MatchedLength
  828. else
  829. MatchStart := FStart;
  830. FMatchedRegEx := nil;
  831. MatchPos := MaxInt;
  832. for I := 0 to Count-1 do begin
  833. ARegEx := RegEx[I];
  834. if (not ARegEx.FoundMatch) or (ARegEx.MatchedOffset < MatchStart) then begin
  835. ARegEx.Start := MatchStart;
  836. ARegEx.MatchAgain;
  837. end;
  838. if ARegEx.FoundMatch and (ARegEx.MatchedOffset < MatchPos) then begin
  839. MatchPos := ARegEx.MatchedOffset;
  840. FMatchedRegEx := ARegEx;
  841. end;
  842. if MatchPos = MatchStart then Break;
  843. end;
  844. Result := MatchPos < MaxInt;
  845. end;
  846. procedure TPerlRegExList.SetRegEx(Index: Integer; Value: TPerlRegEx);
  847. begin
  848. FList[Index] := Value;
  849. UpdateRegEx(Value);
  850. end;
  851. procedure TPerlRegExList.SetStart(const Value: Integer);
  852. var
  853. I: Integer;
  854. begin
  855. if FStart <> Value then begin
  856. FStart := Value;
  857. for I := Count-1 downto 0 do
  858. RegEx[I].Start := Value;
  859. FMatchedRegEx := nil;
  860. end;
  861. end;
  862. procedure TPerlRegExList.SetStop(const Value: Integer);
  863. var
  864. I: Integer;
  865. begin
  866. if FStop <> Value then begin
  867. FStop := Value;
  868. for I := Count-1 downto 0 do
  869. RegEx[I].Stop := Value;
  870. FMatchedRegEx := nil;
  871. end;
  872. end;
  873. procedure TPerlRegExList.SetSubject(const Value: PCREString);
  874. var
  875. I: Integer;
  876. begin
  877. if FSubject <> Value then begin
  878. FSubject := Value;
  879. for I := Count-1 downto 0 do
  880. RegEx[I].Subject := Value;
  881. FMatchedRegEx := nil;
  882. end;
  883. end;
  884. procedure TPerlRegExList.UpdateRegEx(ARegEx: TPerlRegEx);
  885. begin
  886. ARegEx.Subject := FSubject;
  887. ARegEx.Start := FStart;
  888. end;
  889. end.