{ This file is part of the Free Component Library (FCL) Copyright (c) 2023 The Free Pascal team Delphi-compatible Regular expressions unit. See the file COPYING.FPC, included in this distribution, for details about the copyright. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} { Note that the original Delphi unit (by Jan Goyvaerts) uses PCRE1, but this unit uses PCRE2. The string type depends on how the packages were compiled. } unit System.RegularExpressionsCore; {$MODE OBJFPC} {$H+} interface {.$DEFINE USEWIDESTRING} // uncomment if you want to force widestring... // We cannot detect the char size before the uses clause is parsed, it will return 1, the compiler default. // So we need a define here, maybe a compiler switch is needed to set the default size (-Sw ?) which would allow to set the default type. // The detection here is based on the assumption that the dotted units use widestring... {$IFDEF FPC_DOTTEDUNITS} {$DEFINE USEWIDESTRING} {$ENDIF} uses {$IFDEF FPC_DOTTEDUNITS} System.SysUtils, System.Classes, System.Contnrs, System.CTypes, {$IFNDEF CPUWASM} {$IFNDEF USEWIDESTRING} Api.PCRE2_8 {$ELSE} Api.PCRE2_16 {$ENDIF}, {$ELSE} wasm.pcrebridge, {$ENDIF} System.RegularExpressionsConsts; {$ELSE} SysUtils, Classes, Contnrs,ctypes, {$IFNDEF CPUWASM} {$IFNDEF USEWIDESTRING} libpcre2_8 {$ELSE} libpcre2_16 {$ENDIF}, {$ELSE} wasm.pcrebridge, {$ENDIF} System.RegularExpressionsConsts; {$ENDIF} const MAX_SUBEXPRESSIONS = 99; type {$IFDEF USEWIDESTRING} TREString = UnicodeString; {$ElSE} TREString = AnsiString; {$ENDIF} TREStringDynArray = Array of TREString; TPerlRegExOption = (preCaseLess,preMultiLine,preSingleLine,preExtended,preAnchored,preUnGreedy,preNoAutoCapture, preAllowEmptyClass, preAltBSUX, preAltCircumFlex, preAltVerbNames, preDollarEndOnly, preDupNames, preEndAnchored, preFirstLine, preLiteral, preMatchInvalidUTF, preMatchUnsetBackRef, preNeverBackslashC, preNoAutoPossess, preNoDotStarAnchor, preNoStartOptimize, preNoUTFCheck, preUseOffsetLimit); TPerlRegExOptions = set of TPerlRegExOption; TPerlRegExStateItem = (preNotBOL,preNotEOL,preNotEmpty); TPerlRegExState = set of TPerlRegExStateItem; TPerlRegExReplaceEvent = procedure(Sender: TObject; var ReplaceWith: TREString) of object; { TPerlRegEx } TPerlRegEx = class Private Type TTransformation = (tNone,tLowerCase,tUpperCase,tFirstCap,tInitialCap); TMatchResult = (mrFound,mrNotFound,mrAfterStop); class function TransForm(aTransform: TTransformation; const S: TREString): TREString; private {$IFDEF USEWIDESTRING} FCode : Ppcre2_code_16; {$ELSE} FCode : Ppcre2_code_8; {$ENDIF} FOnMatch: TNotifyEvent; FOnReplace: TPerlRegExReplaceEvent; FOptions: TPerlRegExOptions; FRegEx: TREString; FState: TPerlRegExState; FStart, FStop: Integer; FStudied: Boolean; FResultVector : Psize_t; FResultCount : Cardinal; FMatchData : ppcre2_match_data; FModifiedSubject, FSubject: TREString; FSubjectLength : cuint32; FNameCount : cuint32; FNameTable : PCRE2_SPTR; FNameEntrySize : cuint32; FLastModifiedEnd: SizeInt; FReplacement : TREString; FStoredGroups: array of TREString; FCrLFIsNewLine, FIsUtf : Boolean; Procedure CheckMatch; inline; function DoMatch(Opts: CUInt32): TMatchResult; function GetBackRefIndex(const Ref: TREString; var I: Integer): Integer; function GetCompiled: Boolean; function GetFoundMatch: Boolean; inline; function GetGroupCount: Integer; function GetGroupLengths(aIndex: Integer): Integer; function GetGroupOffsets(aIndex: Integer): Integer; function GetGroups(aIndex: Integer): TREString; function GetMatchedLength: Integer; function GetMatchedOffset: Integer; function GetMatchedText: TREString; function GetModifiedSubject: TREString; function GetNamedGroup(const aName : TREString): TREString; procedure GetNamedGroupInfo; function GetNames(aIndex : Integer): TREString; function GetPCREErrorMsg(ErrorNr: Integer): TREString; function GetResultString(aIndex: Integer): TREString; function GetStart: Integer; function GetStop: Integer; function GetSubject: TREString; function GetSubjectLeft: TREString; function GetSubjectRight: TREString; function MakeOptions(aOptions: TPerlRegExOptions): Integer; procedure SetOptions(aValue: TPerlRegExOptions); procedure SetRegEx(const aValue: TREString); procedure SetReplacement(const aValue: TREString); procedure SetStart(aValue: Integer); procedure SetStop(aValue: Integer); procedure SetSubject(const aValue: TREString); protected procedure FreeCodeData; procedure FreeMatchData; procedure CleanUp; virtual; procedure ClearStoredGroups; function FirstOffset : Cardinal; function FirstLength : Cardinal; public constructor Create; destructor Destroy; override; // Use this to escape special characters. class function EscapeRegExChars(const aString: TREString): TREString; // Compile the regex. procedure Compile; // Study regex (may result in faster execution); procedure Study; // Try to match, starting at beginning. Returns true if a match was found. function Match: Boolean; // Try to match again, starting previous match end. Returns true if a new match was found. function MatchAgain: Boolean; // Replace current match in Subject with ComputeReplacement. Returns computed replacement function Replace: TREString; // Replace all matches in Subject with their ComputeReplacement. Returns true if a match was found. function ReplaceAll: Boolean; // Compute replacement text. function ComputeReplacement: TREString; // Store groups for faster access. procedure StoreGroups; // Index in groups of name. function NamedGroup(const aName: TREString): Integer; // Split subject TREString based on regex. aStrings will contain everything outside the matches. procedure Split(const aStrings: TStrings; aLimit: Integer = 0); // Split subject TREString based on regex. Result will contain everything outside the matches. function Split(aLimit: Integer = 0) : TREStringDynArray; // Split subject TREString based on regex, but include matches in result. procedure SplitCapture(const aStrings: TStrings; aLimit: Integer); overload; // Split subject TREString based on regex, but include matches in result. // if aoffset is > 1 then everything till offset is put in the first TREString. procedure SplitCapture(const aStrings: TStrings; aLimit: Integer; aOffset : Integer); overload; // Same with result in array function SplitCapture(aLimit: Integer; aOffset : Integer) : TREStringDynArray; overload; // Was the regex compiled ? property Compiled: Boolean read GetCompiled; // Match found ? property FoundMatch: Boolean read GetFoundMatch; // Did study ? property Studied: Boolean read FStudied; // Fast access, group 0. property MatchedText: TREString read GetMatchedText; property MatchedLength: Integer read GetMatchedLength; property MatchedOffset: Integer read GetMatchedOffset; // Minimum search position, 1-based. property Start: Integer read GetStart write SetStart; // Maximum search position, 1-based. property Stop: Integer read GetStop write SetStop; property State: TPerlRegExState read FState write FState; // Group count. property GroupCount: Integer read GetGroupCount; // Group Texts. Index 0 - GroupCount. 0 is whole matched text. on original search text. property Groups[aIndex: Integer]: TREString read GetGroups; // Group lengths & Offsets. Index 0 - GroupCount. 0 is whole matched text, on original search text. property GroupLengths[aIndex: Integer]: Integer read GetGroupLengths; property GroupOffsets[aIndex: Integer]: Integer read GetGroupOffsets; // Named access to groups. property NamedGroups[aName : TREString] : TREString Read GetNamedGroup; // Names available in current match. Property NameCount : Cardinal Read FNameCount; Property Names[aIndex : Integer] : TREString Read GetNames; // Subject TREString. Will be modified by replace ! property Subject: TREString read GetModifiedSubject write SetSubject; // Original subject TREString. Not modified by replace ! property OriginalSubject: TREString read FSubject write SetSubject; // Left of original subject. property SubjectLeft: TREString read GetSubjectLeft; // Right of original subject. property SubjectRight: TREString read GetSubjectRight; public // Set options. property Options: TPerlRegExOptions read FOptions write SetOptions; // The regular expression property RegEx: TREString read FRegEx write SetRegEx; // The replacement expression. property Replacement: TREString read FReplacement write SetReplacement; // Called on every match. property OnMatch: TNotifyEvent read FOnMatch write FOnMatch; // Set this to modify the computed replacement text. property OnReplace: TPerlRegExReplaceEvent read FOnReplace write FOnReplace; end; TRegExStudyOption = (preJIT, preJITPartialHard, preJITPartialSoft); TRegExStudyOptions = set of TRegExStudyOption; { TPerlRegExList } TPerlRegExList = class private FMatch: TPerlRegEx; FList : TFPObjectList; FStart : Integer; FStop : Integer; FSubject : TREString; function GetCount: Integer; function GetOwnsRegex: Boolean; function GetRegEx(aIndex: Integer): TPerlRegEx; function GetStart: Integer; function GetStop: Integer; function GetSubject: TREString; procedure SetRegEx(aIndex: Integer; aValue: TPerlRegEx); procedure SetStart(aValue: Integer); procedure SetStop(aValue: Integer); procedure SetSubject(aValue: TREString); protected procedure UpdateRegEx(const aRegEx: TPerlRegEx); public constructor Create(OwnsRegex : Boolean); destructor Destroy; override; public function Add(const aRegEx: TPerlRegEx): Integer; procedure Clear; procedure Delete(aIndex: Integer); function IndexOf(const aRegEx: TPerlRegEx): Integer; procedure Insert(aIndex: Integer; const aRegEx: TPerlRegEx); public function Match: Boolean; function MatchAgain: Boolean; property RegEx[aIndex: Integer]: TPerlRegEx read GetRegEx write SetRegEx; property Count: Integer read GetCount; property Subject: TREString read GetSubject write SetSubject; property Start: Integer read GetStart write SetStart; property Stop: Integer read GetStop write SetStop; property MatchedRegEx: TPerlRegEx read FMatch; Property OwnsRegex : Boolean Read GetOwnsRegex; end; ERegularExpressionError = class(Exception); // Todo: move to strutils ? Function InitialCaps(const S : TREString) : TREString; implementation {$IFNDEF USEWIDESTRING} function GetStrLen(p : PAnsiChar; len : Integer) : AnsiString; var L : Integer; begin Result:=''; L:=StrLen(P); if L>Len then L:=Len; SetLength(Result,L); if L>0 then Move(P^,Result[1],L); end; {$ELSE} function GetStrLen(p : PWideChar; len : Integer) : UnicodeString; var L : Integer; begin Result:=''; L:=StrLen(P); if L>Len then L:=Len; SetLength(Result,L); if Len>0 then Move(P^,Result[1],L*2); end; {$ENDIF} Function InitialCaps(const S : TREString) : TREString; const NonWord = [#0..'&', '(', '*', '+', ',', '-', '.', '?', '<', '[', '{', #$B7]; var L : TREString; Len,Last,I : Integer; Upper : Boolean; begin L:=LowerCase(S); Len:=Length(L); Last:=1; I:=1; Upper:=True; Result:=''; While I<=Len do begin if L[i] in NonWord then Upper:=True else if Upper then begin if I>Last then Result:=Result+Copy(L,Last,I-Last); Result:=Result+UpperCase(L[i]); inc(I); Last:=I; Upper:=False; end; Inc(i); end; Result:=Result+Copy(L,Last,I-Last); end; { TPerlRegEx } function TPerlRegEx.GetFoundMatch: Boolean; begin Result:=FResultCount>0; end; function TPerlRegEx.GetCompiled: Boolean; begin Result:=Assigned(FCode); end; procedure TPerlRegEx.CheckMatch; begin if not FoundMatch then raise ERegularExpressionError.Create(SRegExMatchRequired); end; function TPerlRegEx.GetGroupCount: Integer; begin CheckMatch; Result:=FResultCount-1; end; function TPerlRegEx.GetGroupLengths(aIndex: Integer): Integer; begin CheckMatch; Result:=FResultVector[2*aIndex+1]-FResultVector[2*aIndex]; end; function TPerlRegEx.GetGroupOffsets(aIndex: Integer): Integer; begin CheckMatch; Result:=FResultVector[2*aIndex]+1; end; function TPerlRegEx.GetResultString(aIndex: Integer): TREString; var astart,aLength : Ptrint; begin // Writeln('AIndex ',aIndex,' ',FResultCount); aStart:=FResultVector[2*aIndex]; aLength:=FResultVector[2*aIndex+1]-aStart; inc(aStart); // 1-based Result:=Copy(FSubject,AStart,aLength); end; function TPerlRegEx.GetGroups(aIndex: Integer): TREString; begin CheckMatch; if Length(FStoredGroups)>0 then Result:=FStoredGroups[aIndex] else Result:=GetResultString(aIndex); end; function TPerlRegEx.GetMatchedLength: Integer; begin Result:=GetGroupLengths(0) end; function TPerlRegEx.GetMatchedOffset: Integer; begin Result:=GetGroupOffsets(0); end; function TPerlRegEx.GetMatchedText: TREString; begin Result:=GetResultString(0) end; function TPerlRegEx.GetModifiedSubject: TREString; begin Result:=FModifiedSubject; end; function TPerlRegEx.GetNamedGroup(const aName: TREString): TREString; var Idx : integer; begin Result:=''; Idx:=NamedGroup(aName); if Idx<>-1 then Result:=Groups[Idx]; end; function TPerlRegEx.GetStart: Integer; begin Result:=FStart+1; end; function TPerlRegEx.GetStop: Integer; begin Result:=FStop+1; end; function TPerlRegEx.GetSubject: TREString; begin Result:=FSubject; end; function TPerlRegEx.GetSubjectLeft: TREString; begin // Resultvector is 0 based Result:=Copy(FSubject,1,FResultVector[0]); end; function TPerlRegEx.GetSubjectRight: TREString; var SPos : Integer; begin SPos:=FResultVector[1]; // 0-based Result:=Copy(FSubject,SPos+1,FSubjectLength-SPos); end; procedure TPerlRegEx.SetOptions(aValue: TPerlRegExOptions); begin if FOptions=AValue then Exit; FOptions:=AValue; CleanUp; // Need to reset... end; procedure TPerlRegEx.SetRegEx(const aValue: TREString); begin if FRegEx=AValue then Exit; FRegEx:=aValue; end; procedure TPerlRegEx.SetReplacement(const aValue: TREString); begin FReplacement:=AValue; end; procedure TPerlRegEx.SetStart(aValue: Integer); begin FStart:=aValue-1; end; procedure TPerlRegEx.SetStop(aValue: Integer); begin if FStop=aValue-1 then Exit; FStop:=aValue-1; end; procedure TPerlRegEx.SetSubject(const aValue: TREString); begin FSubject:=aValue; FSubjectLength:=Length(FSubject); FModifiedSubject:=aValue; CleanUp; FStart:=0; FStop:=Length(FSubject); end; procedure TPerlRegEx.CleanUp; begin FreeMatchData; FreeCodeData; ClearStoredGroups; FResultCount:=0; FResultVector:=Nil; FLastModifiedEnd:=0; end; procedure TPerlRegEx.ClearStoredGroups; begin SetLength(FStoredGroups,0); end; function TPerlRegEx.FirstOffset: Cardinal; begin Result:=FResultVector[0]; end; function TPerlRegEx.FirstLength: Cardinal; begin Result:=FResultVector[1]-FResultVector[0]; end; constructor TPerlRegEx.Create; begin if not libpcre28loaded then Loadlibpcre28; end; destructor TPerlRegEx.Destroy; begin inherited Destroy; end; class function TPerlRegEx.EscapeRegExChars(const aString: TREString): TREString; Const NeedEscape = ['\','[',']','^','$','.','|','?','*','+','-','(',')','{','}','&','<','>']; var I : Integer; PSrc,PDest,PStart : PChar; begin Result:=''; SetLength(Result,2*Length(aString)); PSrc:=PChar(aString); PDest:=PChar(Result); PStart:=PDest; for I:=1 to Length(aString) do begin if PSrc^=#0 then begin PDest^:='\'; Inc(PDest); PDest^:='0'; end else if CharInSet(PSrc^,NeedEscape) then begin PDest^:='\'; Inc(PDest); PDest^:=PSrc^; end else PDest^:=PSrc^; Inc(PSrc); Inc(PDest); end; SetLength(Result,(PDest-PStart)); end; function TPerlRegEx.MakeOptions(aOptions: TPerlRegExOptions): Integer; Procedure AddOption(aOpt : TPerlRegExOption; aValue : cuint32); begin if aOpt in AOptions then Result:=Result or aValue; end; begin Result:=PCRE2_NEWLINE_ANY or PCRE2_UTF; AddOption(preCaseLess,PCRE2_CASELESS); AddOption(preMultiLine,PCRE2_MULTILINE); AddOption(preSingleLine,PCRE2_DOTALL); AddOption(preExtended,PCRE2_EXTENDED); AddOption(preAnchored,PCRE2_ANCHORED); AddOption(preUnGreedy,PCRE2_UNGREEDY); AddOption(preNoAutoCapture,PCRE2_NO_AUTO_CAPTURE); AddOption(preAllowEmptyClass,PCRE2_ALLOW_EMPTY_CLASS); AddOption(preAltBSUX,PCRE2_ALT_BSUX); AddOption(preAltCircumFlex,PCRE2_ALT_CIRCUMFLEX); AddOption(preAltVerbNames,PCRE2_ALT_VERBNAMES); AddOption(preDollarEndOnly,PCRE2_DOLLAR_ENDONLY); AddOption(preDupNames,PCRE2_DUPNAMES); AddOption(preEndAnchored,PCRE2_ENDANCHORED); AddOption(preFirstLine,PCRE2_FIRSTLINE); AddOption(preLiteral,PCRE2_LITERAL); AddOption(preMatchInvalidUTF,PCRE2_MATCH_INVALID_UTF); AddOption(preMatchUnsetBackRef,PCRE2_MATCH_UNSET_BACKREF); AddOption(preNeverBackslashC,PCRE2_NEVER_BACKSLASH_C); AddOption(preNoAutoPossess,PCRE2_NO_AUTO_POSSESS); AddOption(preNoDotStarAnchor,PCRE2_NO_DOTSTAR_ANCHOR); AddOption(preNoStartOptimize,PCRE2_NO_START_OPTIMIZE); // maybe we should enable by default ? AddOption(preNoUTFCheck,PCRE2_NO_UTF_CHECK); AddOption(preUseOffsetLimit,PCRE2_USE_OFFSET_LIMIT); // AddOption(preUTF,PCRE2_UTF); end; function TPerlRegEx.GetPCREErrorMsg(ErrorNr: Integer): TREString; var Buffer : Array[0..255] of ansichar; begin pcre2_get_error_message(ErrorNr,@Buffer,SizeOf(Buffer)); Result:=strpas(@Buffer); end; procedure TPerlRegEx.Compile; var ErrorNr: Integer; ErrorPos: Integer; begin if (FRegEx='') then raise ERegularExpressionError.CreateRes(@SRegExMissingExpression); CleanUp; FCode:=pcre2_compile(TPCRE2_SPTR8(FRegEx),Length(FRegEx),MakeOptions(FOptions),@ErrorNr,@ErrorPos,Nil); if (FCode=nil) then raise ERegularExpressionError.CreateFmt(SRegExExpressionError,[ErrorPos+1,GetPCREErrorMsg(ErrorNr)]); FMatchData:=pcre2_match_data_create_from_pattern(FCode,Nil); end; procedure TPerlRegEx.Study; begin end; procedure TPerlRegEx.FreeMatchData; var Data : ppcre2_match_data; begin if FMatchData=Nil then exit; Data:=FMatchData; FMatchData:=Nil; pcre2_match_data_free(Data); FResultVector:=Nil; end; procedure TPerlRegEx.FreeCodeData; var {$IFDEF USEWIDESTRING} Data : Ppcre2_code_16; {$ELSE} Data : Ppcre2_code_8; {$ENDIF} begin if (FCode=Nil) then exit; Data:=FCode; FCode:=Nil; pcre2_code_free(Data); end; procedure TPerlRegEx.GetNamedGroupInfo; begin FNameEntrySize:=0; FNameTable:=Nil; pcre2_pattern_info( FCode, (* the compiled pattern *) PCRE2_INFO_NAMECOUNT, (* get the number of named substrings *) @FNameCount); (* where to put the answer *) if (FNameCount = 0) then Exit; pcre2_pattern_info( FCode, (* the compiled pattern *) PCRE2_INFO_NAMETABLE, (* address of the table *) @FNameTable); (* where to put the answer *) pcre2_pattern_info( FCODE, (* the compiled pattern *) PCRE2_INFO_NAMEENTRYSIZE, (* size of each entry in the table *) @FNameEntrySize); end; function TPerlRegEx.GetNames(aIndex : Integer): TREString; var Ptr : PCRE2_SPTR; I : Integer; begin Ptr:=FNameTable; if (aIndex<0) or (aIndex>FNameCount) then Raise ERegularExpressionError.CreateFmt(SErrInvalidNameIndex,[aIndex,FNameCount]); for i:=0 to aIndex-1 do Inc(Ptr,FNameEntrySize); {$IFDEF USEWIDESTRING} Result:=GetStrLen((Ptr+1),FNameEntrySize-2); {$ELSE} Result:=GetStrLen((Ptr+2),FNameEntrySize-3); {$ENDIF} end; function TPerlRegEx.Match: Boolean; var newline,option_bits : cuint32; begin Result:=False; ClearStoredGroups; if not Compiled then Compile; FMatchData:=pcre2_match_data_create_from_pattern(FCode,Nil); Result:=DoMatch(0)=mrFound; if Result then begin pcre2_pattern_info(FCode,PCRE2_INFO_ALLOPTIONS, @option_bits); FIsUtf:=((option_bits and PCRE2_UTF) <> 0); pcre2_pattern_info(FCode,PCRE2_INFO_NEWLINE,@newline); FCrLFIsNewLine:= (newline=PCRE2_NEWLINE_ANY) or (newline=PCRE2_NEWLINE_CRLF) or (newline=PCRE2_NEWLINE_ANYCRLF); end; end; function TPerlRegEx.DoMatch(Opts : CUInt32): TMatchResult; var len,rc : cInt; S : TREString; begin Result:=mrNotFound; {$IF SIZEOF(CHAR)=2} rc:=pcre2_match_w( {$ELSE} rc:=pcre2_match( {$ENDIF} FCode, (* the compiled pattern *) PChar(FSubject), (* the subject TREString *) FSubjectLength, (* the length of the subject *) FStart, (* start at offset 0 in the subject *) Opts, (* default options *) FMatchData, (* block for storing the result *) Nil); if (rc <= 0) then begin FreeMatchData; FreeCodeData; if (rc=PCRE2_ERROR_NOMATCH) then Exit(mrNotFound) else if (rc = 0) then raise ERegularExpressionError.CreateFmt(SRegExMatchError,[SErrRegexOvectorTooSmall]) else raise ERegularExpressionError.CreateFmt(SRegExMatchError,[GetPCREErrorMsg(rc)]); end; Result:=mrFound; FResultCount:=rc; FResultVector:=pcre2_get_ovector_pointer(FMatchData); if FResultVector[0]>FStop then Exit(mrAfterStop); {For i:=0 to FResultCount-1 do Writeln(I,': ',FResultVector[2*I],' - ',FResultVector[2*I+1]);} if (FResultVector[0]>FResultVector[1]) then begin Len:=integer(FResultVector[0]-FResultVector[1]); S:=Copy(FSubject,FResultVector[1],Len); FreeMatchData; FreeCodeData; raise ERegularExpressionError.CreateFmt(SRegExMatcStartAfterEnd,[S]); end; // Next should start after current FStart:=FResultVector[1]; GetNamedGroupInfo; if Assigned(OnMatch) then OnMatch(Self); end; function TPerlRegEx.MatchAgain: Boolean; var StartChar,Opts : cuint32; begin Result:=False; Opts:=0; // Special case, empty TREString. if (FResultVector[0]=FResultVector[1]) then begin if (FResultVector[0]>=FSubjectLength) then Exit; Opts:=PCRE2_NOTEMPTY_ATSTART or PCRE2_ANCHORED; end else begin // Check whether start empty Startchar:=pcre2_get_startchar(FMatchData); if (FStart<=Startchar) then begin (* Reached end of subject. *) if (startchar>=FSubjectLength) then Exit; (* Advance by one character. *) FStart:=StartChar+1; (* If UTF-8, it may be more than one code unit. *) if FIsUtf then begin While (FStart$80) then Exit; Inc(FStart); end; end; end; end; // If we're behind stop, exit at once. Case DoMatch(Opts) of mrAfterStop : Exit(False); mrNotFound : Result:=False; mrFound: Result:=True; end; (* This time, a result of NOMATCH isn't an error. If the value in 'options' is zero, it just means we have found all possible matches, so the loop ends. Otherwise, it means we have failed to find a non-empty-TREString match at a point where there was a previous empty-TREString match. In this case, we do what Perl does: advance the matching position by one character, and continue. We do this by setting the 'end of previous match' offset, because that is picked up at the top of the loop as the point at which to start again. There are two complications: (a) When CRLF is a valid newline sequence, and the current position is just before it, advance by an extra byte. (b) Otherwise we must ensure that we skip an entire UTF character if we are in UTF mode. *) While not Result do begin if Opts=0 then Break; FResultVector[1]:=FStart+1; (* Advance one code unit *) if FCrLFIsNewLine and (* If CRLF is a newline & *) (FStart $80) then break; inc(FResultVector[1]); end; end; Case DoMatch(Opts) of mrAfterStop : begin Result:=False; Break; end; mrNotFound : Result:=False; mrFound: Result:=True; end; end; end; function TPerlRegEx.Replace: TREString; var NewSubject,Tmp : TREString; begin CheckMatch; Result:=ComputeReplacement; if Assigned(OnReplace) then OnReplace(Self, Result); Tmp:=Result; if FLastModifiedEnd=0 then FLastModifiedEnd:=GetMatchedOffset-1; NewSubject:=Copy(FModifiedSubject,1,FLastModifiedEnd)+Tmp; FLastModifiedEnd:=Length(NewSubject)+1; tmp:=GetSubjectRight; FModifiedSubject:=NewSubject+tmp; ClearStoredGroups; end; function TPerlRegEx.ReplaceAll: Boolean; begin Result:=Match; if Not Result then exit; repeat Replace; until not MatchAgain; end; function IsAlphaAndUnderline(const C: Char): Boolean; Const allowed = ['A'..'Z', 'a'..'z', '_']; begin Result:=CharInSet(C,Allowed); end; function IsNumeric(const C: Char): Boolean; Const allowed = ['0'..'9']; begin Result:=CharInSet(C,Allowed); end; { Return values: >=0 : group number. -1 : whole subject. -2 : Left of match. -3 : Right of match. -99 : invalid. On return, I is the index of the next character to process. } function TPerlRegEx.GetBackRefIndex(const Ref: TREString; var I: Integer): Integer; var Len,P,N,Group : Integer; begin Len:=Length(Ref); Group:=-99; Case Ref[I] of '0'..'9': begin Group:=Ord(Ref[i])-Ord('0'); Inc(I); // Only consume as much integers as there are groups. // So if there are 15 groups then $16 -> $1 + literal 6. While (I<=Len) and (Ref[i] in ['0'..'9']) do begin N:=(Group*10)+Ord(Ref[i])-Ord('0'); if N>GroupCount then Break; Group:=N; Inc(I); end; end; '{': begin Inc(I); if (Ref[I] in ['0'..'9']) then // \{123} begin Group:=0; while (ILen) or (Ref[I]<>'}') then Group:=-99 else Inc(I); end else // \{named} begin P:=I; while (ILen) or (Ref[I]<>'}') then Group:=-99 else begin Group:=NamedGroup(Copy(Ref,P,I-P)); if Group=-1 then group:=-99; Inc(I); end end; end; '_': // Whole subject begin Group:=-1; Inc(I); end; '&': // \& or $& (whole regex match) begin Group:=0; Inc(I); end; '+': // Last group begin Group:=GroupCount; Inc(I); end; '`': // Subject to left of match. begin Group:=-2; inc(I); end; #39: // Subject to right of match. begin Group:=-3; inc(I); end end; Result:=Group; end; class function TPerlRegEx.TransForm(aTransform: TTransformation; const S: TREString): TREString; begin Case aTransform of tFirstCap : Result:=UpperCase(Copy(S,1,1))+LowerCase(Copy(S,2,Length(S)-1)); tInitialCap : Result:=InitialCaps(S); tUpperCase : Result:=UpperCase(S); tLowerCase : Result:=LowerCase(S); else Result:=S; end; end; function TPerlRegEx.ComputeReplacement: TREString; var Res : TREString; Len : Integer; Procedure AddToResult(aStart,aNext : Integer); inline; begin Res:=Res+Copy(FReplacement,aStart,aNext-aStart); end; Procedure AddNamedGroup(const aName : TREString); inline; begin Res:=Res+NamedGroups[aName]; end; Function AddBackRef(aTransform : TTransformation; I : Integer) : Integer; var P,N,Group : Integer; begin Group:=GetBackRefIndex(FReplacement,I); Case Group of -99 : ; // invalid -1 : Res:=Res+TransForm(aTransform,FSubject); -2 : Res:=Res+TransForm(aTransform,SubjectLeft); -3 : Res:=Res+TransForm(aTransform,SubjectRight); else if Group<=GroupCount then Res:=Res+TransForm(aTransform,Groups[Group]); end; Result:=I; end; var I, P, Last : Integer; updatelast : boolean; begin Len:=Length(FReplacement); if Len=0 then Exit(''); I:=1; Last:=1; while I<=Len do begin case FReplacement[I] of '\': begin if (I=Len) then raise ERegularExpressionError.CreateFmt(SRegExIndexOutOfBounds,[I]); AddToResult(Last,I); Inc(I); UpdateLast:=True; case FReplacement[I] of '$', '\': begin Inc(I); AddToResult(I-1,I); end; 'g': begin if (I+2 if (I<=Len) and (FReplacement[I]='>') then begin AddNamedGroup(Copy(FReplaceMent,P,I-P)); Inc(I); Last:=I; end else begin I:=I+2; // Skip everything. UpdateLast:=False end; end else UpdateLast:=False; end; 'l','L' : I:=AddBackRef(tLowerCase,I); 'u','U' : I:=AddBackRef(tLowerCase,I); 'f','F' : I:=AddBackRef(tFirstCap,I); 'i','I' : I:=AddBackRef(tInitialCap,I); else I:=AddBackRef(tNone,I); end; if UpdateLast then Last:=I; end; '$': begin if I=Len then raise ERegularExpressionError.CreateFmt(SRegExIndexOutOfBounds,[I]); AddToResult(Last,I); Inc(I); if FReplacement[I]='$' then begin AddToResult(Last,I); Inc(I); end else I:=AddBackRef(tNone,I); Last:=I; end; else // Case Inc(I); end; end; if I>Last then AddToResult(Last,I); Result:=Res; end; procedure TPerlRegEx.StoreGroups; var I : Integer; begin CheckMatch; SetLength(FStoredGroups,GroupCount+1); For I:=0 to GroupCount do FStoredGroups[i]:=GetResultString(I); end; function TPerlRegEx.NamedGroup(const aName: TREString): Integer; var Ptr : PCRE2_SPTR; N,I : Integer; tblName : TREString; begin Ptr:=FNameTable; for i:=0 to FNameCount-1 do begin {$IFDEF USEWIDESTRING} n:=ord(ptr[0]); tblName:=GetStrLen((Ptr+1),FNameEntrySize-2); {$ELSE} n:=(ord(ptr[0]) shl 8) or ord(ptr[1]); tblName:=GetStrLen((Ptr+2),FNameEntrySize-3); {$ENDIF} if SameText(TblName,aName) then Exit(n); Inc(Ptr,FNameEntrySize); end ; Result:=-1; end; procedure TPerlRegEx.Split(const aStrings: TStrings; aLimit: Integer); var NewStart,LastEnd,Matches: Integer; begin if Not Assigned(aStrings) then raise ERegularExpressionError.Create(SRegExStringsRequired); if (aLimit=1) or not Match then begin aStrings.Add(Subject); Exit; end; LastEnd:=0; // Last match pos Matches:=1; repeat NewStart:=FirstOffset; // Start of current match aStrings.Add(Copy(Subject,LastEnd+1,NewStart-LastEnd)); // Copy everything since last match. Inc(Matches); LastEnd:=NewStart+MatchedLength; // update last match pos. until ((aLimit>1) and (Matches>=aLimit)) or not MatchAgain; aStrings.Add(TREString(Copy(FSubject,LastEnd+1,FSubjectLength -LastEnd))); end; function TPerlRegEx.Split(aLimit: Integer): TREStringDynArray; var L: TStrings; I : integer; begin L:=TStringList.Create; try Split(L,aLimit); // We cannot use L.ToStringArray, because the string type may differ :/ SetLength(Result,L.Count); For I:=0 to L.Count-1 do Result[I]:=L[I]; finally L.Free; end; end; procedure TPerlRegEx.SplitCapture(const aStrings: TStrings; aLimit: Integer); begin SplitCapture(aStrings,aLimit,1); end; procedure TPerlRegEx.SplitCapture(const aStrings: TStrings; aLimit: Integer; aOffset: Integer); var NewStart,LastEnd,Matches: Integer; DoCopy : Boolean; begin if Not Assigned(aStrings) then raise ERegularExpressionError.Create(SRegExStringsRequired); if (aLimit=1) or not Match then begin aStrings.Add(Subject); Exit; end; Dec(aOffset); if (aOffset>0) then Dec(aLimit); LastEnd:=0; // Last match pos Matches:=1; repeat NewStart:=FirstOffset; // Start of current match DoCopy:=(NewStart>aOffset); if DoCopy then begin aStrings.Add(Copy(Subject,LastEnd+1,NewStart-LastEnd)); // Copy everything since last match. if GroupCount > 0 then aStrings.Add(Groups[GroupCount]); Inc(Matches); LastEnd:=NewStart+MatchedLength; // update last match pos. end; until ((aLimit>1) and (Matches>=aLimit)) or not MatchAgain; aStrings.Add(TREString(Copy(FSubject,LastEnd+1,FSubjectLength-LastEnd))); end; function TPerlRegEx.SplitCapture(aLimit: Integer; aOffset: Integer): TREStringDynArray; var L: TStrings; I : integer; begin L:=TStringList.Create; try SplitCapture(L,aLimit,aOffset); // We cannot use L.ToStringArray, because the string type may differ :/ SetLength(Result,L.Count); For I:=0 to L.Count-1 do Result[I]:=L[I]; finally L.Free; end; end; { TPerlRegExList } function TPerlRegExList.GetCount: Integer; begin Result:=FList.Count; end; function TPerlRegExList.GetOwnsRegex: Boolean; begin Result:=FList.OwnsObjects; end; function TPerlRegExList.GetRegEx(aIndex: Integer): TPerlRegEx; begin Result:=TPerlRegEx(Flist[aIndex]) end; function TPerlRegExList.GetStart: Integer; begin Result:=FStart; end; function TPerlRegExList.GetStop: Integer; begin Result:=FStop; end; function TPerlRegExList.GetSubject: TREString; begin Result:=FSubject; end; procedure TPerlRegExList.SetRegEx(aIndex: Integer; aValue: TPerlRegEx); begin FList[aIndex]:=aValue; end; procedure TPerlRegExList.SetStart(AValue: Integer); var I : Integer; begin if AValue=FStart then exit; FStart:=aValue; For I:=0 to Count-1 do RegEx[I].Start:=aValue; end; procedure TPerlRegExList.SetStop(AValue: Integer); var I : Integer; begin if AValue=FStart then exit; FStop:=aValue; For I:=0 to Count-1 do RegEx[I].Stop:=aValue; end; procedure TPerlRegExList.SetSubject(aValue: TREString); var I: Integer; begin if aValue=FSUbject then exit; FSubject:=aValue; for I:=Count-1 downto 0 do RegEx[I].Subject:=Subject; FMatch:=nil; end; procedure TPerlRegExList.UpdateRegEx(const aRegEx: TPerlRegEx); begin aRegEx.Subject:=FSubject; ARegEx.Start:=FStart; ARegEx.Stop:=FStop; end; constructor TPerlRegExList.Create(OwnsRegex: Boolean); begin FList:=TFPObjectList.Create(OwnsRegex); end; destructor TPerlRegExList.Destroy; begin FreeAndNil(FList); inherited Destroy; end; function TPerlRegExList.Add(const aRegEx: TPerlRegEx): Integer; begin Result:=FList.Add(aRegEx); UpdateRegEx(aRegEx); end; procedure TPerlRegExList.Clear; begin FList.Clear; end; procedure TPerlRegExList.Delete(aIndex: Integer); begin FList.Delete(aIndex); end; function TPerlRegExList.IndexOf(const aRegEx: TPerlRegEx): Integer; begin Result:=FList.IndexOf(aRegex); end; procedure TPerlRegExList.Insert(aIndex: Integer; const aRegEx: TPerlRegEx); begin FList.Insert(aIndex,aRegex); end; function TPerlRegExList.Match: Boolean; begin SetStart(1); FMatch:=nil; Result:=MatchAgain; end; function TPerlRegExList.MatchAgain: Boolean; var PRE : TPerlRegEx; I,StartAt,Current: Integer; begin // Determine start position if not Assigned(FMatch) then StartAt:=Start else With FMatch do StartAt:=0; // MVC todo {InternalGetMatchedOffset+InternalGetMatchedLength}; FMatch:=nil; Current:=-1; // Check all regexes for new closest match. I:=0; While (IStartAt) do begin PRE:=RegEx[I]; // Should we search this regex again ? if (not PRE.FoundMatch) or (PRE.FirstOffset-1; end; end.