|
@@ -1,4 +1,4 @@
|
|
|
-unit regexpr;
|
|
|
+unit regexpr;
|
|
|
|
|
|
{
|
|
|
TRegExpr class library
|
|
@@ -69,23 +69,23 @@ interface
|
|
|
{$BOOLEVAL OFF}
|
|
|
{$EXTENDEDSYNTAX ON}
|
|
|
{$LONGSTRINGS ON}
|
|
|
-{ OPTIMIZATION ON} // Handled by (fp)make options
|
|
|
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
|
|
|
{$INLINE ON}
|
|
|
{$DEFINE COMPAT}
|
|
|
// ======== Define options for TRegExpr engine
|
|
|
-{ $DEFINE Unicode} // Use WideChar for characters and UnicodeString/WideString for strings
|
|
|
+{ off $DEFINE UnicodeRE} // Use WideChar for characters and UnicodeString/WideString for strings
|
|
|
{ off $DEFINE UnicodeEx} // Support Unicode >0xFFFF, e.g. emoji, e.g. "." must find 2 WideChars of 1 emoji
|
|
|
{$DEFINE UseWordChars} // Use WordChars property, otherwise fixed list 'a'..'z','A'..'Z','0'..'9','_'
|
|
|
{$DEFINE UseSpaceChars} // Use SpaceChars property, otherwise fixed list
|
|
|
{$DEFINE UseLineSep} // Use LineSeparators property, otherwise fixed line-break chars
|
|
|
+{ off $DEFINE FastUnicodeData} // Use arrays for UpperCase/LowerCase/IsWordChar, they take 320K more memory
|
|
|
{$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
|
|
|
{$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
|
|
|
{$IFNDEF FPC} // Not supported in FreePascal
|
|
|
{$DEFINE reRealExceptionAddr} // Exceptions will point to appropriate source line, not to Error procedure
|
|
|
{$ENDIF}
|
|
|
{$DEFINE ComplexBraces} // Support braces in complex cases
|
|
|
-{$IFNDEF Unicode}
|
|
|
+{$IFNDEF UnicodeRE}
|
|
|
{$UNDEF UnicodeEx}
|
|
|
{$UNDEF FastUnicodeData}
|
|
|
{$ENDIF}
|
|
@@ -107,24 +107,30 @@ interface
|
|
|
{$IFDEF FPC} {$DEFINE InlineFuncs} {$ENDIF}
|
|
|
|
|
|
uses
|
|
|
- Classes, // TStrings in Split method
|
|
|
SysUtils, // Exception
|
|
|
{$IFDEF D2009}
|
|
|
- {$IFDEF D_XE}
|
|
|
+ {$IFDEF D_XE2}
|
|
|
System.Character,
|
|
|
{$ELSE}
|
|
|
Character,
|
|
|
{$ENDIF}
|
|
|
{$ENDIF}
|
|
|
- Math;
|
|
|
+ Classes; // TStrings in Split method
|
|
|
|
|
|
type
|
|
|
{$IFNDEF FPC}
|
|
|
// Delphi doesn't have PtrInt but has NativeInt
|
|
|
- PtrInt = NativeInt;
|
|
|
- PtrUInt = NativeInt;
|
|
|
+ // but unfortunately NativeInt is declared wrongly in several versions
|
|
|
+ {$IF SizeOf(Pointer)=4}
|
|
|
+ PtrInt = Integer;
|
|
|
+ PtrUInt = Cardinal;
|
|
|
+ {$ELSE}
|
|
|
+ PtrInt = Int64;
|
|
|
+ PtrUInt = UInt64;
|
|
|
+ {$IFEND}
|
|
|
{$ENDIF}
|
|
|
- {$IFDEF UniCode}
|
|
|
+
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
PRegExprChar = PWideChar;
|
|
|
{$IFDEF FPC}
|
|
|
RegExprString = UnicodeString;
|
|
@@ -137,9 +143,9 @@ type
|
|
|
{$ENDIF}
|
|
|
REChar = WideChar;
|
|
|
{$ELSE}
|
|
|
- PRegExprChar = PChar;
|
|
|
+ PRegExprChar = PAnsiChar;
|
|
|
RegExprString = AnsiString;
|
|
|
- REChar = Char;
|
|
|
+ REChar = AnsiChar;
|
|
|
{$ENDIF}
|
|
|
TREOp = REChar; // internal opcode type
|
|
|
PREOp = ^TREOp;
|
|
@@ -176,7 +182,7 @@ const
|
|
|
{$IFDEF UseLineSep}
|
|
|
// default value for LineSeparators
|
|
|
RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
+ #$2028#$2029#$85
|
|
|
{$ENDIF};
|
|
|
{$ENDIF}
|
|
@@ -184,7 +190,7 @@ const
|
|
|
// Tab and Unicode category "Space Separator":
|
|
|
// https://www.compart.com/en/unicode/category/Zs
|
|
|
RegExprHorzSeparators: RegExprString = #9#$20#$A0
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
+ #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
|
|
|
{$ENDIF};
|
|
|
|
|
@@ -195,14 +201,18 @@ const
|
|
|
RegExprLookbehindIsAtomic: boolean = True;
|
|
|
|
|
|
const
|
|
|
- RegexMaxGroups = 90;
|
|
|
// Max number of groups.
|
|
|
// Be carefull - don't use values which overflow OP_CLOSE* opcode
|
|
|
// (in this case you'll get compiler error).
|
|
|
// Big value causes slower work and more stack required.
|
|
|
- RegexMaxMaxGroups = 255;
|
|
|
+ RegexMaxGroups = 90;
|
|
|
+
|
|
|
// Max possible value for RegexMaxGroups.
|
|
|
// Don't change it! It's defined by internal TRegExpr design.
|
|
|
+ RegexMaxMaxGroups = 255;
|
|
|
+
|
|
|
+ // Max depth of recursion for (?R) and (?1)..(?9)
|
|
|
+ RegexMaxRecursion = 20;
|
|
|
|
|
|
{$IFDEF ComplexBraces}
|
|
|
const
|
|
@@ -251,13 +261,17 @@ type
|
|
|
TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
|
|
|
{$ENDIF}
|
|
|
|
|
|
+ TRegExprBounds = record
|
|
|
+ GrpStart: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group start in InputString
|
|
|
+ GrpEnd: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group end in InputString
|
|
|
+ end;
|
|
|
+ TRegExprBoundsArray = array[0 .. RegexMaxRecursion] of TRegExprBounds;
|
|
|
+
|
|
|
{ TRegExpr }
|
|
|
|
|
|
TRegExpr = class
|
|
|
private
|
|
|
- GrpStart: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group start in InputString
|
|
|
- GrpEnd: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group end in InputString
|
|
|
-
|
|
|
+ GrpBounds: TRegExprBoundsArray;
|
|
|
GrpIndexes: array [0 .. RegexMaxGroups - 1] of integer; // map global group index to _capturing_ group index
|
|
|
GrpNames: array [0 .. RegexMaxGroups - 1] of RegExprString; // names of groups, if non-empty
|
|
|
GrpAtomic: array [0 .. RegexMaxGroups - 1] of boolean; // group[i] is atomic (filled in Compile)
|
|
@@ -306,6 +320,7 @@ type
|
|
|
fRegexStart: PRegExprChar; // pointer to first char of regex
|
|
|
fRegexEnd: PRegExprChar; // pointer after last char of regex
|
|
|
regCurrentGrp: integer; // index of group handling by OP_OPEN* opcode
|
|
|
+ regRecursion: integer; // current level of recursion (?R) (?1); always 0 if no recursion is used
|
|
|
|
|
|
// work variables for compiler's routines
|
|
|
regParse: PRegExprChar; // pointer to currently handling char of regex
|
|
@@ -347,8 +362,10 @@ type
|
|
|
// Exec() param ASlowChecks is set to True, when Length(InputString)<SlowChecksSizeMax
|
|
|
// This ASlowChecks enables to use regMustString optimization
|
|
|
|
|
|
- {$IFNDEF UniCode}
|
|
|
- fLineSepArray: array[byte] of boolean;
|
|
|
+ {$IFDEF UseLineSep}
|
|
|
+ {$IFNDEF UnicodeRE}
|
|
|
+ fLineSepArray: array[byte] of boolean;
|
|
|
+ {$ENDIF}
|
|
|
{$ENDIF}
|
|
|
|
|
|
CharCheckers: TRegExprCharCheckerArray;
|
|
@@ -380,7 +397,6 @@ type
|
|
|
procedure SetUseOsLineEndOnReplace(AValue: boolean);
|
|
|
{$ENDIF}
|
|
|
|
|
|
- function GetUseOsLineEndOnReplace: Boolean;
|
|
|
procedure InitCharCheckers;
|
|
|
function CharChecker_Word(ch: REChar): boolean;
|
|
|
function CharChecker_NotWord(ch: REChar): boolean;
|
|
@@ -602,7 +618,7 @@ type
|
|
|
function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload; deprecated 'Use modern form of ExecPos()';
|
|
|
class function InvertCaseFunction(const Ch: REChar): REChar; deprecated 'This has no effect now';
|
|
|
property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; deprecated 'This has no effect now';
|
|
|
- property UseUnicodeWordDetection: boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE Unicode} instead';
|
|
|
+ property UseUnicodeWordDetection: boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE UnicodeRE} instead';
|
|
|
property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; deprecated 'This has no effect now';
|
|
|
property EmptyInputRaisesError: boolean read fEmptyInputRaisesError write fEmptyInputRaisesError; deprecated 'This has no effect now';
|
|
|
property UseOsLineEndOnReplace: boolean read fUseOsLineEndOnReplace write SetUseOsLineEndOnReplace; deprecated 'Use property ReplaceLineEnd instead';
|
|
@@ -729,7 +745,6 @@ type
|
|
|
property ReplaceLineEnd: RegExprString read fReplaceLineEnd write fReplaceLineEnd;
|
|
|
|
|
|
property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
|
|
|
-
|
|
|
end;
|
|
|
|
|
|
type
|
|
@@ -804,7 +819,7 @@ function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
|
|
|
// n At position n was found closing bracket ')' without
|
|
|
// corresponding opening '('.
|
|
|
// If Result <> 0, then ASubExpr can contain empty items or illegal ones
|
|
|
-function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
|
|
|
+function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
|
|
|
AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
|
|
|
|
|
|
implementation
|
|
@@ -817,7 +832,7 @@ uses
|
|
|
const
|
|
|
// TRegExpr.VersionMajor/Minor return values of these constants:
|
|
|
REVersionMajor = 1;
|
|
|
- REVersionMinor = 155;
|
|
|
+ REVersionMinor = 158;
|
|
|
|
|
|
OpKind_End = REChar(1);
|
|
|
OpKind_MetaClass = REChar(2);
|
|
@@ -833,7 +848,7 @@ const
|
|
|
RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
|
|
|
RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
|
|
|
RegExprSpaceSet = [Ord(' '), $9, $A, $D, $C];
|
|
|
- RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UniCode} + [$85] {$ENDIF};
|
|
|
+ RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UnicodeRE} + [$85] {$ENDIF};
|
|
|
RegExprHorzSeparatorsSet = [9, $20, $A0];
|
|
|
|
|
|
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
|
|
@@ -882,11 +897,12 @@ const
|
|
|
{$ENDIF}
|
|
|
RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
|
|
|
|
|
|
+type
|
|
|
+ PtrPair = {$IFDEF UnicodeRE} ^LongInt; {$ELSE} ^Word; {$ENDIF}
|
|
|
+
|
|
|
function IsPairedBreak(p: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
|
|
const
|
|
|
- cBreak = {$IFDEF Unicode} $000D000A; {$ELSE} $0D0A; {$ENDIF}
|
|
|
-type
|
|
|
- PtrPair = {$IFDEF Unicode} ^LongInt; {$ELSE} ^Word; {$ENDIF}
|
|
|
+ cBreak = {$IFDEF UnicodeRE} $000D000A; {$ELSE} $0D0A; {$ENDIF}
|
|
|
begin
|
|
|
Result := PtrPair(p)^ = cBreak;
|
|
|
end;
|
|
@@ -971,13 +987,13 @@ begin
|
|
|
Exit;
|
|
|
|
|
|
{$IFDEF FPC}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
Result := UnicodeUpperCase(Ch)[1];
|
|
|
{$ELSE}
|
|
|
Result := AnsiUpperCase(Ch)[1];
|
|
|
{$ENDIF}
|
|
|
{$ELSE}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
{$IFDEF D_XE4}
|
|
|
Result := Ch.ToUpper;
|
|
|
{$ELSE}
|
|
@@ -1003,13 +1019,13 @@ begin
|
|
|
Exit;
|
|
|
|
|
|
{$IFDEF FPC}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
Result := UnicodeLowerCase(Ch)[1];
|
|
|
{$ELSE}
|
|
|
Result := AnsiLowerCase(Ch)[1];
|
|
|
{$ENDIF}
|
|
|
{$ELSE}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
{$IFDEF D_XE4}
|
|
|
Result := Ch.ToLower;
|
|
|
{$ELSE}
|
|
@@ -1257,7 +1273,7 @@ begin
|
|
|
end; { of function QuoteRegExprMetaChars
|
|
|
-------------------------------------------------------------- }
|
|
|
|
|
|
-function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
|
|
|
+function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
|
|
|
AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
|
|
|
type
|
|
|
TStackItemRec = record // ###0.945
|
|
@@ -1369,7 +1385,7 @@ begin
|
|
|
begin
|
|
|
// skip eXtended comments
|
|
|
while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
|
|
|
- // do not use [#$d, #$a] due to UniCode compatibility
|
|
|
+ // do not use [#$d, #$a] due to Unicode compatibility
|
|
|
do
|
|
|
Inc(i);
|
|
|
while (i + 1 <= Len) and
|
|
@@ -1484,7 +1500,7 @@ const
|
|
|
OP_SUBCALL = Succ(OP_CLOSE_LAST); // Call of subroutine; OP_SUBCALL+i is for group i
|
|
|
OP_SUBCALL_FIRST = Succ(OP_SUBCALL);
|
|
|
OP_SUBCALL_LAST =
|
|
|
- {$IFDEF Unicode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
TReOp(Ord(OP_SUBCALL) + RegexMaxGroups - 1);
|
|
|
{$ELSE}
|
|
|
High(REChar); // must fit to 0..255 range
|
|
@@ -1801,7 +1817,7 @@ end; { of procedure TRegExpr.SetExpression
|
|
|
function TRegExpr.GetSubExprCount: integer;
|
|
|
begin
|
|
|
// if nothing found, we must return -1 per TRegExpr docs
|
|
|
- if GrpStart[0] = nil then
|
|
|
+ if GrpBounds[0].GrpStart[0] = nil then
|
|
|
Result := -1
|
|
|
else
|
|
|
Result := GrpCount;
|
|
@@ -1810,8 +1826,8 @@ end;
|
|
|
function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
|
|
|
begin
|
|
|
Idx := GrpIndexes[Idx];
|
|
|
- if (Idx >= 0) and (GrpStart[Idx] <> nil) then
|
|
|
- Result := GrpStart[Idx] - fInputStart + 1
|
|
|
+ if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
|
|
|
+ Result := GrpBounds[0].GrpStart[Idx] - fInputStart + 1
|
|
|
else
|
|
|
Result := -1;
|
|
|
end; { of function TRegExpr.GetMatchPos
|
|
@@ -1820,8 +1836,8 @@ end; { of function TRegExpr.GetMatchPos
|
|
|
function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
|
|
|
begin
|
|
|
Idx := GrpIndexes[Idx];
|
|
|
- if (Idx >= 0) and (GrpStart[Idx] <> nil) then
|
|
|
- Result := GrpEnd[Idx] - GrpStart[Idx]
|
|
|
+ if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
|
|
|
+ Result := GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]
|
|
|
else
|
|
|
Result := -1;
|
|
|
end; { of function TRegExpr.GetMatchLen
|
|
@@ -1831,8 +1847,8 @@ function TRegExpr.GetMatch(Idx: integer): RegExprString;
|
|
|
begin
|
|
|
Result := '';
|
|
|
Idx := GrpIndexes[Idx];
|
|
|
- if (Idx >= 0) and (GrpEnd[Idx] > GrpStart[Idx]) then
|
|
|
- SetString(Result, GrpStart[Idx], GrpEnd[Idx] - GrpStart[Idx]);
|
|
|
+ if (Idx >= 0) and (GrpBounds[0].GrpEnd[Idx] > GrpBounds[0].GrpStart[Idx]) then
|
|
|
+ SetString(Result, GrpBounds[0].GrpStart[Idx], GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]);
|
|
|
end; { of function TRegExpr.GetMatch
|
|
|
-------------------------------------------------------------- }
|
|
|
|
|
@@ -2129,7 +2145,7 @@ end;
|
|
|
function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
|
|
|
begin
|
|
|
{$IFDEF UseLineSep}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
Result := Pos(AChar, fLineSeparators) > 0;
|
|
|
{$ELSE}
|
|
|
Result := fLineSepArray[byte(AChar)];
|
|
@@ -2137,7 +2153,7 @@ begin
|
|
|
{$ELSE}
|
|
|
case AChar of
|
|
|
#$d, #$a,
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
#$85, #$2028, #$2029,
|
|
|
{$ENDIF}
|
|
|
#$b, #$c:
|
|
@@ -2164,7 +2180,7 @@ begin
|
|
|
case AChar of
|
|
|
#9, #$20, #$A0:
|
|
|
Result := True;
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
#$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
|
|
|
Result := True;
|
|
|
{$ENDIF}
|
|
@@ -2178,7 +2194,7 @@ begin
|
|
|
case AChar of
|
|
|
#$d, #$a, #$b, #$c:
|
|
|
Result := True;
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
#$2028, #$2029, #$85:
|
|
|
Result := True;
|
|
|
{$ENDIF}
|
|
@@ -2211,12 +2227,12 @@ end; { of procedure TRegExpr.Compile
|
|
|
|
|
|
{$IFDEF UseLineSep}
|
|
|
procedure TRegExpr.InitLineSepArray;
|
|
|
-{$IFNDEF UniCode}
|
|
|
+{$IFNDEF UnicodeRE}
|
|
|
var
|
|
|
i: integer;
|
|
|
{$ENDIF}
|
|
|
begin
|
|
|
- {$IFNDEF UniCode}
|
|
|
+ {$IFNDEF UnicodeRE}
|
|
|
FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
|
|
|
for i := 1 to Length(fLineSeparators) do
|
|
|
fLineSepArray[byte(fLineSeparators[i])] := True;
|
|
@@ -2270,7 +2286,7 @@ begin
|
|
|
// work around PWideChar subtraction bug (Delphi uses
|
|
|
// shr after subtraction to calculate widechar distance %-( )
|
|
|
// so, if difference is negative we have .. the "feature" :(
|
|
|
- // I could wrap it in $IFDEF UniCode, but I didn't because
|
|
|
+ // I could wrap it in $IFDEF UnicodeRE, but I didn't because
|
|
|
// "P – Q computes the difference between the address given
|
|
|
// by P (the higher address) and the address given by Q (the
|
|
|
// lower address)" - Delphi help quotation.
|
|
@@ -2490,7 +2506,7 @@ const
|
|
|
FLAG_SIMPLE = 2; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand
|
|
|
FLAG_SPECSTART = 4; // Starts with * or +
|
|
|
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
RusRangeLoLow = #$430; // 'а'
|
|
|
RusRangeLoHigh = #$44F; // 'я'
|
|
|
RusRangeHiLow = #$410; // 'А'
|
|
@@ -2613,7 +2629,7 @@ begin
|
|
|
for i := 1 to Length(fWordChars) do
|
|
|
begin
|
|
|
ch := fWordChars[i];
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(ch) <= $FF then
|
|
|
{$ENDIF}
|
|
|
Include(ARes, byte(ch));
|
|
@@ -2635,7 +2651,7 @@ begin
|
|
|
for i := 1 to Length(fSpaceChars) do
|
|
|
begin
|
|
|
ch := fSpaceChars[i];
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(ch) <= $FF then
|
|
|
{$ENDIF}
|
|
|
Include(ARes, byte(ch));
|
|
@@ -2664,9 +2680,12 @@ begin
|
|
|
ch := ABuffer^;
|
|
|
Inc(ABuffer);
|
|
|
ch2 := ABuffer^;
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
+ if Ord(ch2) > $FF then
|
|
|
+ ch2 := REChar($FF);
|
|
|
+ {$ENDIF}
|
|
|
Inc(ABuffer);
|
|
|
- for i := Ord(ch) to
|
|
|
- {$IFDEF UniCode} Min(Ord(ch2), $FF) {$ELSE} Ord(ch2) {$ENDIF} do
|
|
|
+ for i := Ord(ch) to Ord(ch2) do
|
|
|
begin
|
|
|
Include(ARes, byte(i));
|
|
|
if AIgnoreCase then
|
|
@@ -2750,7 +2769,7 @@ begin
|
|
|
begin
|
|
|
ch := ABuffer^;
|
|
|
Inc(ABuffer);
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(ch) <= $FF then
|
|
|
{$ENDIF}
|
|
|
begin
|
|
@@ -2951,7 +2970,6 @@ begin
|
|
|
end; { of function TRegExpr.CompileRegExpr
|
|
|
-------------------------------------------------------------- }
|
|
|
|
|
|
-
|
|
|
function TRegExpr.ParseReg(InBrackets: boolean; var FlagParse: integer): PRegExprChar;
|
|
|
// regular expression, i.e. main body or parenthesized thing
|
|
|
// Caller must absorb opening parenthesis.
|
|
@@ -3608,6 +3626,7 @@ begin
|
|
|
FlagTemp := 0;
|
|
|
FlagParse := FLAG_WORST;
|
|
|
AddrOfLen := nil;
|
|
|
+ GrpIndex := -1;
|
|
|
|
|
|
Inc(regParse);
|
|
|
case (regParse - 1)^ of
|
|
@@ -4323,7 +4342,10 @@ var
|
|
|
TheMax: PtrInt; // PtrInt, gets diff of 2 pointers
|
|
|
InvChar: REChar;
|
|
|
CurStart, CurEnd: PRegExprChar;
|
|
|
- ArrayIndex, i: integer;
|
|
|
+ ArrayIndex: integer;
|
|
|
+ {$IFDEF UnicodeEx}
|
|
|
+ i: integer;
|
|
|
+ {$ENDIF}
|
|
|
begin
|
|
|
Result := 0;
|
|
|
scan := regInput; // points into InputString
|
|
@@ -4391,10 +4413,10 @@ begin
|
|
|
ArrayIndex := GrpIndexes[Ord(opnd^)];
|
|
|
if ArrayIndex < 0 then
|
|
|
Exit;
|
|
|
- CurStart := GrpStart[ArrayIndex];
|
|
|
+ CurStart := GrpBounds[regRecursion].GrpStart[ArrayIndex];
|
|
|
if CurStart = nil then
|
|
|
Exit;
|
|
|
- CurEnd := GrpEnd[ArrayIndex];
|
|
|
+ CurEnd := GrpBounds[regRecursion].GrpEnd[ArrayIndex];
|
|
|
if CurEnd = nil then
|
|
|
Exit;
|
|
|
repeat
|
|
@@ -4416,10 +4438,10 @@ begin
|
|
|
ArrayIndex := GrpIndexes[Ord(opnd^)];
|
|
|
if ArrayIndex < 0 then
|
|
|
Exit;
|
|
|
- CurStart := GrpStart[ArrayIndex];
|
|
|
+ CurStart := GrpBounds[regRecursion].GrpStart[ArrayIndex];
|
|
|
if CurStart = nil then
|
|
|
Exit;
|
|
|
- CurEnd := GrpEnd[ArrayIndex];
|
|
|
+ CurEnd := GrpBounds[regRecursion].GrpEnd[ArrayIndex];
|
|
|
if CurEnd = nil then
|
|
|
Exit;
|
|
|
repeat
|
|
@@ -4711,19 +4733,19 @@ function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
|
|
|
// by recursion.
|
|
|
|
|
|
var
|
|
|
- scan: PRegExprChar; // Current node.
|
|
|
- next: PRegExprChar; // Next node.
|
|
|
+ scan: PRegExprChar; // current node
|
|
|
+ next: PRegExprChar; // next node
|
|
|
Len: PtrInt;
|
|
|
- opnd: PRegExprChar;
|
|
|
+ opnd, opGrpEnd: PRegExprChar;
|
|
|
no: integer;
|
|
|
save: PRegExprChar;
|
|
|
saveCurrentGrp: integer;
|
|
|
nextch: REChar;
|
|
|
BracesMin, BracesMax: integer;
|
|
|
- // we use integer instead of TREBracesArg for better support */+
|
|
|
+ // we use integer instead of TREBracesArg to better support */+
|
|
|
{$IFDEF ComplexBraces}
|
|
|
- SavedLoopStack: TRegExprLoopStack; // :(( very bad for recursion
|
|
|
- SavedLoopStackIdx: integer; // ###0.925
|
|
|
+ SavedLoopStack: TRegExprLoopStack; // very bad for recursion
|
|
|
+ SavedLoopStackIdx: integer;
|
|
|
{$ENDIF}
|
|
|
bound1, bound2: boolean;
|
|
|
checkAtomicGroup: boolean;
|
|
@@ -4984,13 +5006,14 @@ begin
|
|
|
no := GrpIndexes[no];
|
|
|
if no < 0 then
|
|
|
Exit;
|
|
|
- if GrpStart[no] = nil then
|
|
|
+ opnd := GrpBounds[regRecursion].GrpStart[no];
|
|
|
+ if opnd = nil then
|
|
|
Exit;
|
|
|
- if GrpEnd[no] = nil then
|
|
|
+ opGrpEnd := GrpBounds[regRecursion].GrpEnd[no];
|
|
|
+ if opGrpEnd = nil then
|
|
|
Exit;
|
|
|
save := regInput;
|
|
|
- opnd := GrpStart[no];
|
|
|
- while opnd < GrpEnd[no] do
|
|
|
+ while opnd < opGrpEnd do
|
|
|
begin
|
|
|
if (save >= fInputEnd) or (save^ <> opnd^) then
|
|
|
Exit;
|
|
@@ -5006,13 +5029,14 @@ begin
|
|
|
no := GrpIndexes[no];
|
|
|
if no < 0 then
|
|
|
Exit;
|
|
|
- if GrpStart[no] = nil then
|
|
|
+ opnd := GrpBounds[regRecursion].GrpStart[no];
|
|
|
+ if opnd = nil then
|
|
|
Exit;
|
|
|
- if GrpEnd[no] = nil then
|
|
|
+ opGrpEnd := GrpBounds[regRecursion].GrpEnd[no];
|
|
|
+ if opGrpEnd = nil then
|
|
|
Exit;
|
|
|
save := regInput;
|
|
|
- opnd := GrpStart[no];
|
|
|
- while opnd < GrpEnd[no] do
|
|
|
+ while opnd < opGrpEnd do
|
|
|
begin
|
|
|
if (save >= fInputEnd) or
|
|
|
((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
|
|
@@ -5082,11 +5106,11 @@ begin
|
|
|
begin
|
|
|
no := Ord(scan^) - Ord(OP_OPEN);
|
|
|
regCurrentGrp := no;
|
|
|
- save := GrpStart[no]; // ###0.936
|
|
|
- GrpStart[no] := regInput; // ###0.936
|
|
|
+ save := GrpBounds[regRecursion].GrpStart[no];
|
|
|
+ GrpBounds[regRecursion].GrpStart[no] := regInput;
|
|
|
Result := MatchPrim(next);
|
|
|
- if not Result then // ###0.936
|
|
|
- GrpStart[no] := save;
|
|
|
+ if not Result then
|
|
|
+ GrpBounds[regRecursion].GrpStart[no] := save;
|
|
|
// handle negative lookahead
|
|
|
if regLookaheadNeg then
|
|
|
if no = regLookaheadGroup then
|
|
@@ -5096,11 +5120,11 @@ begin
|
|
|
begin
|
|
|
// we need zero length of "lookahead group",
|
|
|
// it is later used to adjust the match
|
|
|
- GrpStart[no] := regInput;
|
|
|
- GrpEnd[no]:= regInput;
|
|
|
+ GrpBounds[regRecursion].GrpStart[no] := regInput;
|
|
|
+ GrpBounds[regRecursion].GrpEnd[no]:= regInput;
|
|
|
end
|
|
|
else
|
|
|
- GrpStart[no] := save;
|
|
|
+ GrpBounds[regRecursion].GrpStart[no] := save;
|
|
|
end;
|
|
|
Exit;
|
|
|
end;
|
|
@@ -5113,8 +5137,8 @@ begin
|
|
|
// (we are here because some OP_BRANCH is matched)
|
|
|
if GrpAtomic[no] then
|
|
|
GrpAtomicDone[no] := True;
|
|
|
- save := GrpEnd[no]; // ###0.936
|
|
|
- GrpEnd[no] := regInput; // ###0.936
|
|
|
+ save := GrpBounds[regRecursion].GrpEnd[no];
|
|
|
+ GrpBounds[regRecursion].GrpEnd[no] := regInput;
|
|
|
|
|
|
// if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return
|
|
|
// in OP_CLOSE, without going to next opcode
|
|
@@ -5126,7 +5150,7 @@ begin
|
|
|
|
|
|
Result := MatchPrim(next);
|
|
|
if not Result then // ###0.936
|
|
|
- GrpEnd[no] := save;
|
|
|
+ GrpBounds[regRecursion].GrpEnd[no] := save;
|
|
|
Exit;
|
|
|
end;
|
|
|
|
|
@@ -5398,7 +5422,15 @@ begin
|
|
|
OP_RECUR:
|
|
|
begin
|
|
|
// call opcode start
|
|
|
- if not MatchPrim(regCodeWork) then Exit;
|
|
|
+ if regRecursion < RegexMaxRecursion then
|
|
|
+ begin
|
|
|
+ Inc(regRecursion);
|
|
|
+ bound1 := MatchPrim(regCodeWork);
|
|
|
+ Dec(regRecursion);
|
|
|
+ end
|
|
|
+ else
|
|
|
+ bound1 := False;
|
|
|
+ if not bound1 then Exit;
|
|
|
end;
|
|
|
|
|
|
OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
|
|
@@ -5408,15 +5440,19 @@ begin
|
|
|
if no < 0 then Exit;
|
|
|
save := GrpOpCodes[no];
|
|
|
if save = nil then Exit;
|
|
|
- checkAtomicGroup := GrpSubCalled[no];
|
|
|
- // mark group in GrpSubCalled array so opcode can detect subcall
|
|
|
- GrpSubCalled[no] := True;
|
|
|
- if not MatchPrim(save) then
|
|
|
+ if regRecursion < RegexMaxRecursion then
|
|
|
begin
|
|
|
+ // mark group in GrpSubCalled array so opcode can detect subcall
|
|
|
+ checkAtomicGroup := GrpSubCalled[no];
|
|
|
+ GrpSubCalled[no] := True;
|
|
|
+ Inc(regRecursion);
|
|
|
+ bound1 := MatchPrim(save);
|
|
|
+ Dec(regRecursion);
|
|
|
GrpSubCalled[no] := checkAtomicGroup;
|
|
|
- Exit;
|
|
|
- end;
|
|
|
- GrpSubCalled[no] := checkAtomicGroup;
|
|
|
+ end
|
|
|
+ else
|
|
|
+ bound1 := False;
|
|
|
+ if not bound1 then Exit;
|
|
|
end;
|
|
|
|
|
|
else
|
|
@@ -5488,27 +5524,26 @@ begin
|
|
|
regInput := APos;
|
|
|
regCurrentGrp := -1;
|
|
|
regNestedCalls := 0;
|
|
|
+ regRecursion := 0;
|
|
|
Result := MatchPrim(regCodeWork);
|
|
|
if Result then
|
|
|
begin
|
|
|
- GrpStart[0] := APos;
|
|
|
- GrpEnd[0] := regInput;
|
|
|
+ GrpBounds[0].GrpStart[0] := APos;
|
|
|
+ GrpBounds[0].GrpEnd[0] := regInput;
|
|
|
|
|
|
// with lookbehind, increase found position by the len of group=1
|
|
|
if regLookbehind then
|
|
|
- Inc(GrpStart[0], GrpEnd[1] - GrpStart[1]);
|
|
|
+ Inc(GrpBounds[0].GrpStart[0], GrpBounds[0].GrpEnd[1] - GrpBounds[0].GrpStart[1]);
|
|
|
|
|
|
// with lookahead, decrease ending by the len of group=regLookaheadGroup
|
|
|
if regLookahead and (regLookaheadGroup > 0) then
|
|
|
- Dec(GrpEnd[0], GrpEnd[regLookaheadGroup] - GrpStart[regLookaheadGroup]);
|
|
|
+ Dec(GrpBounds[0].GrpEnd[0], GrpBounds[0].GrpEnd[regLookaheadGroup] - GrpBounds[0].GrpStart[regLookaheadGroup]);
|
|
|
end;
|
|
|
end;
|
|
|
|
|
|
procedure TRegExpr.ClearMatches;
|
|
|
begin
|
|
|
- FillChar(GrpStart, SizeOf(GrpStart), 0);
|
|
|
- FillChar(GrpEnd, SizeOf(GrpEnd), 0);
|
|
|
-
|
|
|
+ FillChar(GrpBounds, SizeOf(GrpBounds), 0);
|
|
|
FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
|
|
|
FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
|
|
|
end;
|
|
@@ -5517,9 +5552,7 @@ procedure TRegExpr.ClearInternalIndexes;
|
|
|
var
|
|
|
i: integer;
|
|
|
begin
|
|
|
- FillChar(GrpStart, SizeOf(GrpStart), 0);
|
|
|
- FillChar(GrpEnd, SizeOf(GrpEnd), 0);
|
|
|
-
|
|
|
+ FillChar(GrpBounds, SizeOf(GrpBounds), 0);
|
|
|
FillChar(GrpAtomic, SizeOf(GrpAtomic), 0);
|
|
|
FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
|
|
|
FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
|
|
@@ -5588,7 +5621,7 @@ begin
|
|
|
if ATryOnce or (regAnchored <> #0) then
|
|
|
begin
|
|
|
{$IFDEF UseFirstCharSet}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(Ptr^) <= $FF then
|
|
|
{$ENDIF}
|
|
|
if not FirstCharArray[byte(Ptr^)] then
|
|
@@ -5619,7 +5652,7 @@ begin
|
|
|
end;
|
|
|
|
|
|
{$IFDEF UseFirstCharSet}
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(Ptr^) <= $FF then
|
|
|
{$ENDIF}
|
|
|
if not FirstCharArray[byte(Ptr^)] then
|
|
@@ -5639,8 +5672,8 @@ var
|
|
|
PtrBegin, PtrEnd: PRegExprChar;
|
|
|
Offset: PtrInt;
|
|
|
begin
|
|
|
- PtrBegin := GrpStart[0];
|
|
|
- PtrEnd := GrpEnd[0];
|
|
|
+ PtrBegin := GrpBounds[0].GrpStart[0];
|
|
|
+ PtrEnd := GrpBounds[0].GrpEnd[0];
|
|
|
if (PtrBegin = nil) or (PtrEnd = nil) then
|
|
|
begin
|
|
|
Error(reeExecNextWithoutExec);
|
|
@@ -5711,6 +5744,7 @@ var
|
|
|
GrpName: RegExprString;
|
|
|
begin
|
|
|
Result := 0;
|
|
|
+ GrpName := '';
|
|
|
p := APtr;
|
|
|
Delimited := (p < TemplateEnd) and (p^ = '{');
|
|
|
if Delimited then
|
|
@@ -5788,7 +5822,7 @@ begin
|
|
|
if GroupFound then
|
|
|
begin
|
|
|
if n >= 0 then
|
|
|
- Inc(ResultLen, GrpEnd[n] - GrpStart[n]);
|
|
|
+ Inc(ResultLen, GrpBounds[0].GrpEnd[n] - GrpBounds[0].GrpStart[n]);
|
|
|
end
|
|
|
else
|
|
|
begin
|
|
@@ -5846,8 +5880,8 @@ begin
|
|
|
begin
|
|
|
if n >= 0 then
|
|
|
begin
|
|
|
- p0 := GrpStart[n];
|
|
|
- p1 := GrpEnd[n];
|
|
|
+ p0 := GrpBounds[0].GrpStart[n];
|
|
|
+ p1 := GrpBounds[0].GrpEnd[n];
|
|
|
end
|
|
|
else
|
|
|
p1 := p0;
|
|
@@ -6010,7 +6044,10 @@ var
|
|
|
opnd: PRegExprChar;
|
|
|
Oper: TREOp;
|
|
|
ch: REChar;
|
|
|
- min_cnt, i: integer;
|
|
|
+ min_cnt: integer;
|
|
|
+ {$IFDEF UseLineSep}
|
|
|
+ i: integer;
|
|
|
+ {$ENDIF}
|
|
|
TempSet: TRegExprCharset;
|
|
|
begin
|
|
|
TempSet := [];
|
|
@@ -6127,7 +6164,7 @@ begin
|
|
|
OP_EXACTLYCI:
|
|
|
begin
|
|
|
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(ch) <= $FF then
|
|
|
{$ENDIF}
|
|
|
begin
|
|
@@ -6140,7 +6177,7 @@ begin
|
|
|
OP_EXACTLY:
|
|
|
begin
|
|
|
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
|
|
|
- {$IFDEF UniCode}
|
|
|
+ {$IFDEF UnicodeRE}
|
|
|
if Ord(ch) <= $FF then
|
|
|
{$ENDIF}
|
|
|
Include(FirstCharSet, byte(ch));
|
|
@@ -6336,11 +6373,6 @@ begin
|
|
|
end;
|
|
|
end;
|
|
|
|
|
|
-function TRegExpr.GetUseOsLineEndOnReplace: Boolean;
|
|
|
-begin
|
|
|
- Result:=fReplaceLineEnd=sLineBreak;
|
|
|
-end;
|
|
|
-
|
|
|
function TRegExpr.CharChecker_Word(ch: REChar): boolean;
|
|
|
begin
|
|
|
Result := IsWordChar(ch);
|
|
@@ -6864,7 +6896,7 @@ begin
|
|
|
end;
|
|
|
|
|
|
{$IFDEF reRealExceptionAddr}
|
|
|
-{ OPTIMIZATION ON} // specified via (fp)make
|
|
|
+{$OPTIMIZATION ON}
|
|
|
// ReturnAddr works correctly only if compiler optimization is ON
|
|
|
// I placed this method at very end of unit because there are no
|
|
|
// way to restore compiler optimization flag ...
|