123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963 |
- {**************************************************************************************************}
- { }
- { Perl Regular Expressions VCL component }
- { }
- { The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
- { you may not use this file except in compliance with the License. You may obtain a copy of the }
- { License at http://www.mozilla.org/MPL/ }
- { }
- { Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
- { ANY KIND, either express or implied. See the License for the specific language governing rights }
- { and limitations under the License. }
- { }
- { The Original Code is PerlRegEx.pas. }
- { }
- { The Initial Developer of the Original Code is Jan Goyvaerts. }
- { Portions created by Jan Goyvaerts are Copyright (C) 1999, 2005, 2008, 2010 Jan Goyvaerts. }
- { All rights reserved. }
- { }
- { Design & implementation, by Jan Goyvaerts, 1999, 2005, 2008, 2010 }
- { }
- { TPerlRegEx is available at http://www.regular-expressions.info/delphi.html }
- { }
- {**************************************************************************************************}
- unit PerlRegEx;
- interface
- uses
- Windows, Messages, SysUtils, Classes,
- pcre;
- type
- TPerlRegExOptions = set of (
- preCaseLess, // /i -> Case insensitive
- preMultiLine, // /m -> ^ and $ also match before/after a newline, not just at the beginning and the end of the string
- preSingleLine, // /s -> Dot matches any character, including \n (newline). Otherwise, it matches anything except \n
- preExtended, // /x -> Allow regex to contain extra whitespace, newlines and Perl-style comments, all of which will be filtered out
- preAnchored, // /A -> Successful match can only occur at the start of the subject or right after the previous match
- preUnGreedy, // Repeat operators (+, *, ?) are not greedy by default (i.e. they try to match the minimum number of characters instead of the maximum)
- preNoAutoCapture // (group) is a non-capturing group; only named groups capture
- );
- type
- TPerlRegExState = set of (
- preNotBOL, // Not Beginning Of Line: ^ does not match at the start of Subject
- preNotEOL, // Not End Of Line: $ does not match at the end of Subject
- preNotEmpty // Empty matches not allowed
- );
- const
- // Maximum number of subexpressions (backreferences)
- // Subexpressions are created by placing round brackets in the regex, and are referenced by \1, \2, ...
- // In Perl, they are available as $1, $2, ... after the regex matched; with TPerlRegEx, use the Subexpressions property
- // You can also insert \1, \2, ... in the replacement string; \0 is the complete matched expression
- MAX_SUBEXPRESSIONS = 99;
- {$IFDEF UNICODE}
- // All implicit string casts have been verified to be correct
- {$WARN IMPLICIT_STRING_CAST OFF}
- // Use UTF-8 in Delphi 2009 and later, so Unicode strings are handled correctly.
- // PCRE does not support UTF-16
- type
- PCREString = UTF8String;
- {$ELSE UNICODE}
- // Use AnsiString in Delphi 2007 and earlier
- type
- PCREString = AnsiString;
- {$ENDIF UNICODE}
- type
- TPerlRegExReplaceEvent = procedure(Sender: TObject; var ReplaceWith: PCREString) of object;
- type
- TPerlRegEx = class
- private // *** Property storage, getters and setters
- FCompiled, FStudied: Boolean;
- FOptions: TPerlRegExOptions;
- FState: TPerlRegExState;
- FRegEx, FReplacement, FSubject: PCREString;
- FStart, FStop: Integer;
- FOnMatch: TNotifyEvent;
- FOnReplace: TPerlRegExReplaceEvent;
- function GetMatchedText: PCREString;
- function GetMatchedLength: Integer;
- function GetMatchedOffset: Integer;
- procedure SetOptions(Value: TPerlRegExOptions);
- procedure SetRegEx(const Value: PCREString);
- function GetGroupCount: Integer;
- function GetGroups(Index: Integer): PCREString;
- function GetGroupLengths(Index: Integer): Integer;
- function GetGroupOffsets(Index: Integer): Integer;
- procedure SetSubject(const Value: PCREString);
- procedure SetStart(const Value: Integer);
- procedure SetStop(const Value: Integer);
- function GetFoundMatch: Boolean;
- private // *** Variables used by PCRE
- Offsets: array[0..(MAX_SUBEXPRESSIONS+1)*3] of Integer;
- OffsetCount: Integer;
- pcreOptions: Integer;
- pattern, hints, chartable: Pointer;
- FSubjectPChar: PAnsiChar;
- FHasStoredGroups: Boolean;
- FStoredGroups: array of PCREString;
- function GetSubjectLeft: PCREString;
- function GetSubjectRight: PCREString;
- protected
- procedure CleanUp;
- // Dispose off whatever we created, so we can start over. Called automatically when needed, so it is not made public
- procedure ClearStoredGroups;
- public
- constructor Create;
- // Come to life
- destructor Destroy; override;
- // Clean up after ourselves
- class function EscapeRegExChars(const S: string): string;
- // Escapes regex characters in S so that the regex engine can be used to match S as plain text
- procedure Compile;
- // Compile the regex. Called automatically by Match
- procedure Study;
- // Study the regex. Studying takes time, but will make the execution of the regex a lot faster.
- // Call study if you will be using the same regex many times
- function Match: Boolean;
- // Attempt to match the regex, starting the attempt from the beginning of Subject
- function MatchAgain: Boolean;
- // Attempt to match the regex to the remainder of Subject after the previous match (as indicated by Start)
- function Replace: PCREString;
- // Replace matched expression in Subject with ComputeReplacement. Returns the actual replacement text from ComputeReplacement
- function ReplaceAll: Boolean;
- // Repeat MatchAgain and Replace until you drop. Returns True if anything was replaced at all.
- function ComputeReplacement: PCREString;
- // Returns Replacement with backreferences filled in
- procedure StoreGroups;
- // Stores duplicates of Groups[] so they and ComputeReplacement will still return the proper strings
- // even if FSubject is changed or cleared
- function NamedGroup(const Name: PCREString): Integer;
- // Returns the index of the named group Name
- procedure Split(Strings: TStrings; Limit: Integer);
- // Split Subject along regex matches. Capturing groups are ignored.
- procedure SplitCapture(Strings: TStrings; Limit: Integer); overload;
- procedure SplitCapture(Strings: TStrings; Limit: Integer; Offset: Integer); overload;
- // Split Subject along regex matches. Capturing groups are added to Strings as well.
- property Compiled: Boolean read FCompiled;
- // True if the RegEx has already been compiled.
- property FoundMatch: Boolean read GetFoundMatch;
- // Returns True when Matched* and Group* indicate a match
- property Studied: Boolean read FStudied;
- // True if the RegEx has already been studied
- property MatchedText: PCREString read GetMatchedText;
- // The matched text
- property MatchedLength: Integer read GetMatchedLength;
- // Length of the matched text
- property MatchedOffset: Integer read GetMatchedOffset;
- // Character offset in the Subject string at which MatchedText starts
- property Start: Integer read FStart write SetStart;
- // Starting position in Subject from which MatchAgain begins
- property Stop: Integer read FStop write SetStop;
- // Last character in Subject that Match and MatchAgain search through
- property State: TPerlRegExState read FState write FState;
- // State of Subject
- property GroupCount: Integer read GetGroupCount;
- // Number of matched capturing groups
- property Groups[Index: Integer]: PCREString read GetGroups;
- // Text matched by capturing groups
- property GroupLengths[Index: Integer]: Integer read GetGroupLengths;
- // Lengths of the text matched by capturing groups
- property GroupOffsets[Index: Integer]: Integer read GetGroupOffsets;
- // Character offsets in Subject at which the capturing group matches start
- property Subject: PCREString read FSubject write SetSubject;
- // The string on which Match() will try to match RegEx
- property SubjectLeft: PCREString read GetSubjectLeft;
- // Part of the subject to the left of the match
- property SubjectRight: PCREString read GetSubjectRight;
- // Part of the subject to the right of the match
- public
- property Options: TPerlRegExOptions read FOptions write SetOptions;
- // Options
- property RegEx: PCREString read FRegEx write SetRegEx;
- // The regular expression to be matched
- property Replacement: PCREString read FReplacement write FReplacement;
- // Text to replace matched expression with. \number and $number backreferences will be substituted with Groups
- // TPerlRegEx supports the "JGsoft" replacement text flavor as explained at http://www.regular-expressions.info/refreplace.html
- property OnMatch: TNotifyEvent read FOnMatch write FOnMatch;
- // Triggered by Match and MatchAgain after a successful match
- property OnReplace: TPerlRegExReplaceEvent read FOnReplace write FOnReplace;
- // Triggered by Replace and ReplaceAll just before the replacement is done, allowing you to determine the new PCREString
- end;
- {
- You can add TPerlRegEx instances to a TPerlRegExList to match them all together on the same subject,
- as if they were one regex regex1|regex2|regex3|...
- TPerlRegExList does not own the TPerlRegEx components, just like a TList
- If a TPerlRegEx has been added to a TPerlRegExList, it should not be used in any other situation
- until it is removed from the list
- }
- type
- TPerlRegExList = class
- private
- FList: TList;
- FSubject: PCREString;
- FMatchedRegEx: TPerlRegEx;
- FStart, FStop: Integer;
- function GetRegEx(Index: Integer): TPerlRegEx;
- procedure SetRegEx(Index: Integer; Value: TPerlRegEx);
- procedure SetSubject(const Value: PCREString);
- procedure SetStart(const Value: Integer);
- procedure SetStop(const Value: Integer);
- function GetCount: Integer;
- protected
- procedure UpdateRegEx(ARegEx: TPerlRegEx);
- public
- constructor Create;
- destructor Destroy; override;
- public
- function Add(ARegEx: TPerlRegEx): Integer;
- procedure Clear;
- procedure Delete(Index: Integer);
- function IndexOf(ARegEx: TPerlRegEx): Integer;
- procedure Insert(Index: Integer; ARegEx: TPerlRegEx);
- public
- function Match: Boolean;
- function MatchAgain: Boolean;
- property RegEx[Index: Integer]: TPerlRegEx read GetRegEx write SetRegEx;
- property Count: Integer read GetCount;
- property Subject: PCREString read FSubject write SetSubject;
- property Start: Integer read FStart write SetStart;
- property Stop: Integer read FStop write SetStop;
- property MatchedRegEx: TPerlRegEx read FMatchedRegEx;
- end;
- implementation
- { ********* Unit support routines ********* }
- function FirstCap(const S: string): string;
- begin
- if S = '' then Result := ''
- else begin
- Result := AnsiLowerCase(S);
- {$IFDEF UNICODE}
- CharUpperBuffW(@Result[1], 1);
- {$ELSE}
- CharUpperBuffA(@Result[1], 1);
- {$ENDIF}
- end
- end;
- function InitialCaps(const S: string): string;
- var
- I: Integer;
- Up: Boolean;
- begin
- Result := AnsiLowerCase(S);
- Up := True;
- {$IFDEF UNICODE}
- for I := 1 to Length(Result) do begin
- case Result[I] of
- #0..'&', '(', '*', '+', ',', '-', '.', '?', '<', '[', '{', #$00B7:
- Up := True
- else
- if Up and (Result[I] <> '''') then begin
- CharUpperBuffW(@Result[I], 1);
- Up := False
- end
- end;
- end;
- {$ELSE UNICODE}
- if SysLocale.FarEast then begin
- I := 1;
- while I <= Length(Result) do begin
- if Result[I] in LeadBytes then begin
- Inc(I, 2)
- end
- else begin
- if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{'] then Up := True
- else if Up and (Result[I] <> '''') then begin
- CharUpperBuffA(@Result[I], 1);
- Result[I] := UpperCase(Result[I])[1];
- Up := False
- end;
- Inc(I)
- end
- end
- end
- else
- for I := 1 to Length(Result) do begin
- if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{', #$B7] then Up := True
- else if Up and (Result[I] <> '''') then begin
- CharUpperBuffA(@Result[I], 1);
- Result[I] := AnsiUpperCase(Result[I])[1];
- Up := False
- end
- end;
- {$ENDIF UNICODE}
- end;
- { ********* TPerlRegEx component ********* }
- procedure TPerlRegEx.CleanUp;
- begin
- FCompiled := False; FStudied := False;
- pcre_dispose(pattern, hints, nil);
- pattern := nil;
- hints := nil;
- ClearStoredGroups;
- OffsetCount := 0;
- end;
- procedure TPerlRegEx.ClearStoredGroups;
- begin
- FHasStoredGroups := False;
- FStoredGroups := nil;
- end;
- procedure TPerlRegEx.Compile;
- var
- Error: PAnsiChar;
- ErrorOffset: Integer;
- begin
- if FRegEx = '' then
- raise Exception.Create('TPerlRegEx.Compile() - Please specify a regular expression in RegEx first');
- CleanUp;
- Pattern := pcre_compile(PAnsiChar(FRegEx), pcreOptions, @Error, @ErrorOffset, chartable);
- if Pattern = nil then
- raise Exception.Create(Format('TPerlRegEx.Compile() - Error in regex at offset %d: %s', [ErrorOffset, AnsiString(Error)]));
- FCompiled := True
- end;
- (* Backreference overview:
- Assume there are 13 backreferences:
- Text TPerlRegex .NET Java ECMAScript
- $17 $1 + "7" "$17" $1 + "7" $1 + "7"
- $017 $1 + "7" "$017" $1 + "7" $1 + "7"
- $12 $12 $12 $12 $12
- $012 $1 + "2" $12 $12 $1 + "2"
- ${1}2 $1 + "2" $1 + "2" error "${1}2"
- $$ "$" "$" error "$"
- \$ "$" "\$" "$" "\$"
- *)
- function TPerlRegEx.ComputeReplacement: PCREString;
- var
- Mode: AnsiChar;
- S: PCREString;
- I, J, N: Integer;
- procedure ReplaceBackreference(Number: Integer);
- var
- Backreference: PCREString;
- begin
- Delete(S, I, J-I);
- if Number <= GroupCount then begin
- Backreference := Groups[Number];
- if Backreference <> '' then begin
- // Ignore warnings; converting to UTF-8 does not cause data loss
- case Mode of
- 'L', 'l': Backreference := AnsiLowerCase(Backreference);
- 'U', 'u': Backreference := AnsiUpperCase(Backreference);
- 'F', 'f': Backreference := FirstCap(Backreference);
- 'I', 'i': Backreference := InitialCaps(Backreference);
- end;
- if S <> '' then begin
- Insert(Backreference, S, I);
- I := I + Length(Backreference);
- end
- else begin
- S := Backreference;
- I := MaxInt;
- end
- end;
- end
- end;
- procedure ProcessBackreference(NumberOnly, Dollar: Boolean);
- var
- Number, Number2: Integer;
- Group: PCREString;
- begin
- Number := -1;
- if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
- // Get the number of the backreference
- Number := Ord(S[J]) - Ord('0');
- Inc(J);
- if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
- // Expand it to two digits only if that would lead to a valid backreference
- Number2 := Number*10 + Ord(S[J]) - Ord('0');
- if Number2 <= GroupCount then begin
- Number := Number2;
- Inc(J)
- end;
- end;
- end
- else if not NumberOnly then begin
- if Dollar and (J < Length(S)) and (S[J] = '{') then begin
- // Number or name in curly braces
- Inc(J);
- case S[J] of
- '0'..'9': begin
- Number := Ord(S[J]) - Ord('0');
- Inc(J);
- while (J <= Length(S)) and (S[J] in ['0'..'9']) do begin
- Number := Number*10 + Ord(S[J]) - Ord('0');
- Inc(J)
- end;
- end;
- 'A'..'Z', 'a'..'z', '_': begin
- Inc(J);
- while (J <= Length(S)) and (S[J] in ['A'..'Z', 'a'..'z', '0'..'9', '_']) do Inc(J);
- if (J <= Length(S)) and (S[J] = '}') then begin
- Group := Copy(S, I+2, J-I-2);
- Number := NamedGroup(Group);
- end
- end;
- end;
- if (J > Length(S)) or (S[J] <> '}') then Number := -1
- else Inc(J)
- end
- else if Dollar and (S[J] = '_') then begin
- // $_ (whole subject)
- Delete(S, I, J+1-I);
- Insert(Subject, S, I);
- I := I + Length(Subject);
- Exit;
- end
- else case S[J] of
- '&': begin
- // \& or $& (whole regex match)
- Number := 0;
- Inc(J);
- end;
- '+': begin
- // \+ or $+ (highest-numbered participating group)
- Number := GroupCount;
- Inc(J);
- end;
- '`': begin
- // \` or $` (backtick; subject to the left of the match)
- Delete(S, I, J+1-I);
- Insert(SubjectLeft, S, I);
- I := I + Offsets[0] - 1;
- Exit;
- end;
- '''': begin
- // \' or $' (straight quote; subject to the right of the match)
- Delete(S, I, J+1-I);
- Insert(SubjectRight, S, I);
- I := I + Length(Subject) - Offsets[1];
- Exit;
- end
- end;
- end;
- if Number >= 0 then ReplaceBackreference(Number)
- else Inc(I)
- end;
- begin
- S := FReplacement;
- I := 1;
- while I < Length(S) do begin
- case S[I] of
- '\': begin
- J := I + 1;
- Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
- case S[J] of
- '$', '\': begin
- Delete(S, I, 1);
- Inc(I);
- end;
- 'g': begin
- if (J < Length(S)-1) and (S[J+1] = '<') and (S[J+2] in ['A'..'Z', 'a'..'z', '_']) then begin
- // Python-style named group reference \g<name>
- J := J+3;
- while (J <= Length(S)) and (S[J] in ['0'..'9', 'A'..'Z', 'a'..'z', '_']) do Inc(J);
- if (J <= Length(S)) and (S[J] = '>') then begin
- N := NamedGroup(Copy(S, I+3, J-I-3));
- Inc(J);
- Mode := #0;
- if N > 0 then ReplaceBackreference(N)
- else Delete(S, I, J-I)
- end
- else I := J
- end
- else I := I+2;
- end;
- 'l', 'L', 'u', 'U', 'f', 'F', 'i', 'I': begin
- Mode := S[J];
- Inc(J);
- ProcessBackreference(True, False);
- end;
- else begin
- Mode := #0;
- ProcessBackreference(False, False);
- end;
- end;
- end;
- '$': begin
- J := I + 1;
- Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
- if S[J] = '$' then begin
- Delete(S, J, 1);
- Inc(I);
- end
- else begin
- Mode := #0;
- ProcessBackreference(False, True);
- end
- end;
- else Inc(I)
- end
- end;
- Result := S
- end;
- constructor TPerlRegEx.Create;
- begin
- inherited Create;
- FState := [preNotEmpty];
- chartable := pcre_maketables;
- {$IFDEF UNICODE}
- pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
- {$ELSE}
- pcreOptions := PCRE_NEWLINE_ANY;
- {$ENDIF}
- end;
- destructor TPerlRegEx.Destroy;
- begin
- pcre_dispose(pattern, hints, chartable);
- inherited Destroy;
- end;
- class function TPerlRegEx.EscapeRegExChars(const S: string): string;
- var
- I: Integer;
- begin
- Result := S;
- I := Length(Result);
- while I > 0 do begin
- case Result[I] of
- '.', '[', ']', '(', ')', '?', '*', '+', '{', '}', '^', '$', '|', '\':
- Insert('\', Result, I);
- #0: begin
- Result[I] := '0';
- Insert('\', Result, I);
- end;
- end;
- Dec(I);
- end;
- end;
- function TPerlRegEx.GetFoundMatch: Boolean;
- begin
- Result := OffsetCount > 0;
- end;
- function TPerlRegEx.GetMatchedText: PCREString;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Result := GetGroups(0);
- end;
- function TPerlRegEx.GetMatchedLength: Integer;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Result := GetGroupLengths(0)
- end;
- function TPerlRegEx.GetMatchedOffset: Integer;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Result := GetGroupOffsets(0)
- end;
- function TPerlRegEx.GetGroupCount: Integer;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Result := OffsetCount-1
- end;
- function TPerlRegEx.GetGroupLengths(Index: Integer): Integer;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Assert((Index >= 0) and (Index <= GroupCount), 'REQUIRE: Index <= GroupCount');
- Result := Offsets[Index*2+1]-Offsets[Index*2]
- end;
- function TPerlRegEx.GetGroupOffsets(Index: Integer): Integer;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- Assert((Index >= 0) and (Index <= GroupCount), 'REQUIRE: Index <= GroupCount');
- Result := Offsets[Index*2]
- end;
- function TPerlRegEx.GetGroups(Index: Integer): PCREString;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- if Index > GroupCount then Result := ''
- else if FHasStoredGroups then Result := FStoredGroups[Index]
- else Result := Copy(FSubject, Offsets[Index*2], Offsets[Index*2+1]-Offsets[Index*2]);
- end;
- function TPerlRegEx.GetSubjectLeft: PCREString;
- begin
- Result := Copy(Subject, 1, Offsets[0]-1);
- end;
- function TPerlRegEx.GetSubjectRight: PCREString;
- begin
- Result := Copy(Subject, Offsets[1], MaxInt);
- end;
- function TPerlRegEx.Match: Boolean;
- var
- I, Opts: Integer;
- begin
- ClearStoredGroups;
- if not Compiled then Compile;
- if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
- if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
- if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
- OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, 0, Opts, @Offsets[0], High(Offsets));
- Result := OffsetCount > 0;
- // Convert offsets into PCREString indices
- if Result then begin
- for I := 0 to OffsetCount*2-1 do
- Inc(Offsets[I]);
- FStart := Offsets[1];
- if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
- if Assigned(OnMatch) then OnMatch(Self)
- end;
- end;
- function TPerlRegEx.MatchAgain: Boolean;
- var
- I, Opts: Integer;
- begin
- ClearStoredGroups;
- if not Compiled then Compile;
- if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
- if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
- if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
- if FStart-1 > FStop then OffsetCount := -1
- else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, FStart-1, Opts, @Offsets[0], High(Offsets));
- Result := OffsetCount > 0;
- // Convert offsets into PCREString indices
- if Result then begin
- for I := 0 to OffsetCount*2-1 do
- Inc(Offsets[I]);
- FStart := Offsets[1];
- if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
- if Assigned(OnMatch) then OnMatch(Self)
- end;
- end;
- function TPerlRegEx.NamedGroup(const Name: PCREString): Integer;
- begin
- Result := pcre_get_stringnumber(Pattern, PAnsiChar(Name));
- end;
- function TPerlRegEx.Replace: PCREString;
- begin
- Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
- // Substitute backreferences
- Result := ComputeReplacement;
- // Allow for just-in-time substitution determination
- if Assigned(OnReplace) then OnReplace(Self, Result);
- // Perform substitution
- Delete(FSubject, MatchedOffset, MatchedLength);
- if Result <> '' then Insert(Result, FSubject, MatchedOffset);
- FSubjectPChar := PAnsiChar(FSubject);
- // Position to continue search
- FStart := FStart - MatchedLength + Length(Result);
- FStop := FStop - MatchedLength + Length(Result);
- // Replacement no longer matches regex, we assume
- ClearStoredGroups;
- OffsetCount := 0;
- end;
- function TPerlRegEx.ReplaceAll: Boolean;
- begin
- if Match then begin
- Result := True;
- repeat
- Replace
- until not MatchAgain;
- end
- else Result := False;
- end;
- procedure TPerlRegEx.SetOptions(Value: TPerlRegExOptions);
- begin
- if (FOptions <> Value) then begin
- FOptions := Value;
- {$IFDEF UNICODE}
- pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
- {$ELSE}
- pcreOptions := PCRE_NEWLINE_ANY;
- {$ENDIF}
- if (preCaseLess in Value) then pcreOptions := pcreOptions or PCRE_CASELESS;
- if (preMultiLine in Value) then pcreOptions := pcreOptions or PCRE_MULTILINE;
- if (preSingleLine in Value) then pcreOptions := pcreOptions or PCRE_DOTALL;
- if (preExtended in Value) then pcreOptions := pcreOptions or PCRE_EXTENDED;
- if (preAnchored in Value) then pcreOptions := pcreOptions or PCRE_ANCHORED;
- if (preUnGreedy in Value) then pcreOptions := pcreOptions or PCRE_UNGREEDY;
- if (preNoAutoCapture in Value) then pcreOptions := pcreOptions or PCRE_NO_AUTO_CAPTURE;
- CleanUp
- end
- end;
- procedure TPerlRegEx.SetRegEx(const Value: PCREString);
- begin
- if FRegEx <> Value then begin
- FRegEx := Value;
- CleanUp
- end
- end;
- procedure TPerlRegEx.SetStart(const Value: Integer);
- begin
- if Value < 1 then FStart := 1
- else FStart := Value;
- // If FStart > Length(Subject), MatchAgain() will simply return False
- end;
- procedure TPerlRegEx.SetStop(const Value: Integer);
- begin
- if Value > Length(Subject) then FStop := Length(Subject)
- else FStop := Value;
- end;
- procedure TPerlRegEx.SetSubject(const Value: PCREString);
- begin
- FSubject := Value;
- FSubjectPChar := PAnsiChar(Value);
- FStart := 1;
- FStop := Length(Subject);
- if not FHasStoredGroups then OffsetCount := 0;
- end;
- procedure TPerlRegEx.Split(Strings: TStrings; Limit: Integer);
- var
- Offset, Count: Integer;
- begin
- Assert(Strings <> nil, 'REQUIRE: Strings');
- if (Limit = 1) or not Match then Strings.Add(Subject)
- else begin
- Offset := 1;
- Count := 1;
- repeat
- Strings.Add(Copy(Subject, Offset, MatchedOffset - Offset));
- Inc(Count);
- Offset := MatchedOffset + MatchedLength;
- until ((Limit > 1) and (Count >= Limit)) or not MatchAgain;
- Strings.Add(Copy(Subject, Offset, MaxInt));
- end
- end;
- procedure TPerlRegEx.SplitCapture(Strings: TStrings; Limit, Offset: Integer);
- var
- Count: Integer;
- bUseOffset : boolean;
- iOffset : integer;
- begin
- Assert(Strings <> nil, 'REQUIRE: Strings');
- if (Limit = 1) or not Match then Strings.Add(Subject)
- else
- begin
- bUseOffset := Offset <> 1;
- if Offset <> 1 then
- Dec(Limit);
- iOffset := 1;
- Count := 1;
- repeat
- if bUseOffset then
- begin
- if MatchedOffset >= Offset then
- begin
- bUseOffset := False;
- Strings.Add(Copy(Subject, 1, MatchedOffset -1));
- if Self.GroupCount > 0 then
- Strings.Add(Self.Groups[Self.GroupCount]);
- end;
- end
- else
- begin
- Strings.Add(Copy(Subject, iOffset, MatchedOffset - iOffset));
- Inc(Count);
- if Self.GroupCount > 0 then
- Strings.Add(Self.Groups[Self.GroupCount]);
- end;
- iOffset := MatchedOffset + MatchedLength;
- until ((Limit > 1) and (Count >= Limit)) or not MatchAgain;
- Strings.Add(Copy(Subject, iOffset, MaxInt));
- end
- end;
- procedure TPerlRegEx.SplitCapture(Strings: TStrings; Limit: Integer);
- begin
- SplitCapture(Strings,Limit,1);
- end;
- procedure TPerlRegEx.StoreGroups;
- var
- I: Integer;
- begin
- if OffsetCount > 0 then begin
- ClearStoredGroups;
- SetLength(FStoredGroups, GroupCount+1);
- for I := GroupCount downto 0 do
- FStoredGroups[I] := Groups[I];
- FHasStoredGroups := True;
- end
- end;
- procedure TPerlRegEx.Study;
- var
- Error: PAnsiChar;
- begin
- if not FCompiled then Compile;
- Hints := pcre_study(Pattern, 0, @Error);
- if Error <> nil then
- raise Exception.Create('TPerlRegEx.Study() - Error studying the regex: ' + AnsiString(Error));
- FStudied := True
- end;
- { TPerlRegExList }
- function TPerlRegExList.Add(ARegEx: TPerlRegEx): Integer;
- begin
- Result := FList.Add(ARegEx);
- UpdateRegEx(ARegEx);
- end;
- procedure TPerlRegExList.Clear;
- begin
- FList.Clear;
- end;
- constructor TPerlRegExList.Create;
- begin
- inherited Create;
- FList := TList.Create;
- end;
- procedure TPerlRegExList.Delete(Index: Integer);
- begin
- FList.Delete(Index);
- end;
- destructor TPerlRegExList.Destroy;
- begin
- FList.Free;
- inherited
- end;
- function TPerlRegExList.GetCount: Integer;
- begin
- Result := FList.Count;
- end;
- function TPerlRegExList.GetRegEx(Index: Integer): TPerlRegEx;
- begin
- Result := TPerlRegEx(Pointer(FList[Index]));
- end;
- function TPerlRegExList.IndexOf(ARegEx: TPerlRegEx): Integer;
- begin
- Result := FList.IndexOf(ARegEx);
- end;
- procedure TPerlRegExList.Insert(Index: Integer; ARegEx: TPerlRegEx);
- begin
- FList.Insert(Index, ARegEx);
- UpdateRegEx(ARegEx);
- end;
- function TPerlRegExList.Match: Boolean;
- begin
- SetStart(1);
- FMatchedRegEx := nil;
- Result := MatchAgain;
- end;
- function TPerlRegExList.MatchAgain: Boolean;
- var
- I, MatchStart, MatchPos: Integer;
- ARegEx: TPerlRegEx;
- begin
- if FMatchedRegEx <> nil then
- MatchStart := FMatchedRegEx.MatchedOffset + FMatchedRegEx.MatchedLength
- else
- MatchStart := FStart;
- FMatchedRegEx := nil;
- MatchPos := MaxInt;
- for I := 0 to Count-1 do begin
- ARegEx := RegEx[I];
- if (not ARegEx.FoundMatch) or (ARegEx.MatchedOffset < MatchStart) then begin
- ARegEx.Start := MatchStart;
- ARegEx.MatchAgain;
- end;
- if ARegEx.FoundMatch and (ARegEx.MatchedOffset < MatchPos) then begin
- MatchPos := ARegEx.MatchedOffset;
- FMatchedRegEx := ARegEx;
- end;
- if MatchPos = MatchStart then Break;
- end;
- Result := MatchPos < MaxInt;
- end;
- procedure TPerlRegExList.SetRegEx(Index: Integer; Value: TPerlRegEx);
- begin
- FList[Index] := Value;
- UpdateRegEx(Value);
- end;
- procedure TPerlRegExList.SetStart(const Value: Integer);
- var
- I: Integer;
- begin
- if FStart <> Value then begin
- FStart := Value;
- for I := Count-1 downto 0 do
- RegEx[I].Start := Value;
- FMatchedRegEx := nil;
- end;
- end;
- procedure TPerlRegExList.SetStop(const Value: Integer);
- var
- I: Integer;
- begin
- if FStop <> Value then begin
- FStop := Value;
- for I := Count-1 downto 0 do
- RegEx[I].Stop := Value;
- FMatchedRegEx := nil;
- end;
- end;
- procedure TPerlRegExList.SetSubject(const Value: PCREString);
- var
- I: Integer;
- begin
- if FSubject <> Value then begin
- FSubject := Value;
- for I := Count-1 downto 0 do
- RegEx[I].Subject := Value;
- FMatchedRegEx := nil;
- end;
- end;
- procedure TPerlRegExList.UpdateRegEx(ARegEx: TPerlRegEx);
- begin
- ARegEx.Subject := FSubject;
- ARegEx.Start := FStart;
- end;
- end.
|