Browse Source

* Added UseUnicodeWordDetection

git-svn-id: trunk@39564 -
michael 7 years ago
parent
commit
71bbab3512
3 changed files with 86 additions and 9 deletions
  1. 1 0
      .gitattributes
  2. 27 0
      packages/regexpr/examples/demowd.pp
  3. 58 9
      packages/regexpr/src/regexpr.pas

+ 1 - 0
.gitattributes

@@ -7256,6 +7256,7 @@ packages/regexpr/Makefile.fpc svneol=native#text/plain
 packages/regexpr/Makefile.fpc.fpcmake svneol=native#text/plain
 packages/regexpr/Makefile.fpc.fpcmake svneol=native#text/plain
 packages/regexpr/examples/Makefile svneol=native#text/plain
 packages/regexpr/examples/Makefile svneol=native#text/plain
 packages/regexpr/examples/Makefile.fpc svneol=native#text/plain
 packages/regexpr/examples/Makefile.fpc svneol=native#text/plain
+packages/regexpr/examples/demowd.pp svneol=native#text/plain
 packages/regexpr/examples/splitwords.lpi svneol=native#text/plain
 packages/regexpr/examples/splitwords.lpi svneol=native#text/plain
 packages/regexpr/examples/splitwords.pp svneol=native#text/plain
 packages/regexpr/examples/splitwords.pp svneol=native#text/plain
 packages/regexpr/examples/testreg1.pp svneol=native#text/plain
 packages/regexpr/examples/testreg1.pp svneol=native#text/plain

+ 27 - 0
packages/regexpr/examples/demowd.pp

@@ -0,0 +1,27 @@
+{
+  Program to demonstrate UseUnicodeWordDetection property. 
+  Run this program as
+  testwd
+  testwd 1
+  to see the difference  
+}
+{$mode objfpc}
+{$h+}
+uses cwstring,uregexpr;
+
+Function ReplaceRegExpr(ARegExpr, AInputStr, AReplaceStr : Unicodestring) : string;
+
+begin
+  with TRegExpr.Create do
+    try
+      UseUnicodeWordDetection:=ParamStr(1)='1'; 
+      Expression := ARegExpr;
+      Result:=Replace (AInputStr, AReplaceStr, True);
+    finally
+      Free;
+    end;
+end;
+
+begin    
+  Writeln(ReplaceRegExpr('\w+', UTF8Decode('test слово ŕáćéí ϸϬϛ ュユョ'), '<$0>'));
+end.

+ 58 - 9
packages/regexpr/src/regexpr.pas

@@ -49,6 +49,7 @@ unit RegExpr;
 interface
 interface
 
 
 {off $DEFINE DebugSynRegExpr}
 {off $DEFINE DebugSynRegExpr}
+{$DEFINE UnicodeWordDetection}
 
 
 {$IFDEF FPC}
 {$IFDEF FPC}
  {$MODE DELPHI} // Delphi-compatible mode in FreePascal
  {$MODE DELPHI} // Delphi-compatible mode in FreePascal
@@ -101,6 +102,9 @@ interface
  {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
  {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
 {$ENDIF}
 {$ENDIF}
 {$DEFINE UseOsLineEndOnReplace} // On Replace if replace-with has "\n", use System.LineEnding (#10 #13 or #13#10); else use #10
 {$DEFINE UseOsLineEndOnReplace} // On Replace if replace-with has "\n", use System.LineEnding (#10 #13 or #13#10); else use #10
+{$IFNDEF UNICODE}
+{$UNDEF UnicodeWordDetection}
+{$ENDIF}
 
 
 // ======== Define Pascal-language options
 // ======== Define Pascal-language options
 // Define 'UseAsserts' option (do not edit this definitions).
 // Define 'UseAsserts' option (do not edit this definitions).
@@ -292,7 +296,12 @@ type
     {$IFNDEF UniCode}
     {$IFNDEF UniCode}
     fLineSeparatorsSet : set of REChar;
     fLineSeparatorsSet : set of REChar;
     {$ENDIF}
     {$ENDIF}
-    Function IsSpaceChar(AChar : PRegExprChar) : Boolean; inline;
+    {$IFDEF UnicodeWordDetection}
+    FUseUnicodeWordDetection : Boolean;
+    function IsUnicodeWordChar(AChar : REChar) : Boolean;
+    {$ENDIF}
+    function IsWordChar(AChar : REChar) : Boolean; inline;
+    function IsSpaceChar(AChar : PRegExprChar) : Boolean; inline;
 
 
     // Mark programm as having to be [re]compiled
     // Mark programm as having to be [re]compiled
     procedure InvalidateProgramm;
     procedure InvalidateProgramm;
@@ -574,6 +583,10 @@ type
     // global constant)
     // global constant)
     property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
     property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
 
 
+    {$IFDEF UnicodeWordDetection}
+    // If set to true, in addition to using WordChars, a heuristic to detect unicode word letters is used for \w
+    Property UseUnicodeWordDetection : Boolean Read FUseUnicodeWordDetection Write FUseUnicodeWordDetection;
+    {$ENDIF}
     // line separators (like \n in Unix)
     // line separators (like \n in Unix)
     property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
     property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
 
 
@@ -661,6 +674,10 @@ function RegExprSubExpressions (const ARegExpr : string;
 implementation
 implementation
 
 
 {$IFDEF FPC}
 {$IFDEF FPC}
+{$IFDEF UnicodeWordDetection}
+uses
+  UnicodeData;
+{$ENDIF}
 {$ELSE}
 {$ELSE}
 uses
 uses
 {$IFDEF SYN_WIN32}
 {$IFDEF SYN_WIN32}
@@ -1205,6 +1222,9 @@ constructor TRegExpr.Create;
 
 
   regexpbeg := nil;
   regexpbeg := nil;
   fExprIsCompiled := false;
   fExprIsCompiled := false;
+  {$IFDEF UnicodeWordDetection}
+  FUseUnicodeWordDetection:=False;
+  {$ENDIF}
 
 
   ModifierI := RegExprModifierI;
   ModifierI := RegExprModifierI;
   ModifierR := RegExprModifierR;
   ModifierR := RegExprModifierR;
@@ -1481,6 +1501,35 @@ procedure TRegExpr.SetModifier (AIndex : integer; ASet : boolean);
 {==================== Compiler section =======================}
 {==================== Compiler section =======================}
 {=============================================================}
 {=============================================================}
 
 
+{$IFDEF UnicodeWordDetection}
+function TRegExpr.IsUnicodeWordChar(AChar: REChar): Boolean;
+var
+  NType: byte;
+begin
+  if Ord(AChar)<128 then
+    exit(false)
+  else
+  if Ord(AChar)>=LOW_SURROGATE_BEGIN then
+    exit(false)
+  else
+  begin
+    NType:= GetProps(Ord(AChar))^.Category;
+    Result:= (NType<=UGC_OtherNumber);
+  end;
+end;
+{$ENDIF}
+
+
+function TRegExpr.IsWordChar(AChar: REChar): Boolean; inline;
+begin
+  Result := Pos(AChar, fWordChars)>0;
+  {$IFDEF UnicodeWordDetection}
+  If Not Result and UseUnicodeWordDetection then
+    Result:=IsUnicodeWordChar(aChar);
+  {$ENDIF}
+end;
+
+
 function TRegExpr.IsSpaceChar(AChar: PRegExprChar): Boolean;
 function TRegExpr.IsSpaceChar(AChar: PRegExprChar): Boolean;
 begin
 begin
   Result:=Pos(AChar^,fSpaceChars)>0;
   Result:=Pos(AChar^,fSpaceChars)>0;
@@ -2793,7 +2842,7 @@ function TRegExpr.regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
     {$IFNDEF UseSetOfChar} //###0.929
     {$IFNDEF UseSetOfChar} //###0.929
     ANYLETTER:
     ANYLETTER:
       while (Result < TheMax) and
       while (Result < TheMax) and
-       (Pos (scan^, fWordChars) > 0) //###0.940
+         IsWordChar(scan^) //###0.940
      {  ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
      {  ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
        or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
        or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
         inc (Result);
         inc (Result);
@@ -2801,7 +2850,7 @@ function TRegExpr.regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
        end;
        end;
     NOTLETTER:
     NOTLETTER:
       while (Result < TheMax) and
       while (Result < TheMax) and
-       (Pos (scan^, fWordChars) <= 0)  //###0.940
+         not IsWordChar(scan^)  //###0.940
      {   not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
      {   not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
          or (scan^ >= 'A') and (scan^ <= 'Z')
          or (scan^ >= 'A') and (scan^ <= 'Z')
          or (scan^ = '_'))} do begin
          or (scan^ = '_'))} do begin
@@ -2933,11 +2982,11 @@ function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
          BOUND:
          BOUND:
          if (scan^ = BOUND)
          if (scan^ = BOUND)
           xor (
           xor (
-          ((reginput = fInputStart) or (Pos ((reginput - 1)^, fWordChars) <= 0))
-            and (reginput^ <> #0) and (Pos (reginput^, fWordChars) > 0)
+          ((reginput = fInputStart) or not IsWordChar((reginput - 1)^))
+            and (reginput^ <> #0) and IsWordChar(reginput^)
            or
            or
-            (reginput <> fInputStart) and (Pos ((reginput - 1)^, fWordChars) > 0)
-            and ((reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0)))
+            (reginput <> fInputStart) and IsWordChar((reginput - 1)^)
+            and ((reginput^ = #0) or not IsWordChar(reginput^)))
           then EXIT;
           then EXIT;
 
 
          BOL: if reginput <> fInputStart
          BOL: if reginput <> fInputStart
@@ -3006,12 +3055,12 @@ function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
            end;
            end;
          {$IFNDEF UseSetOfChar} //###0.929
          {$IFNDEF UseSetOfChar} //###0.929
          ANYLETTER: begin
          ANYLETTER: begin
-            if (reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0) //###0.943
+            if (reginput^ = #0) or not IsWordChar(reginput^) //###0.943
              then EXIT;
              then EXIT;
             inc (reginput);
             inc (reginput);
            end;
            end;
          NOTLETTER: begin
          NOTLETTER: begin
-            if (reginput^ = #0) or (Pos (reginput^, fWordChars) > 0) //###0.943
+            if (reginput^ = #0) or IsWordChar(reginput^) //###0.943
              then EXIT;
              then EXIT;
             inc (reginput);
             inc (reginput);
            end;
            end;