Browse Source

fcl-passrc: resolver: split source and target codepage

git-svn-id: trunk@47830 -
Mattias Gaertner 4 years ago
parent
commit
fe4fda2a35

+ 255 - 91
packages/fcl-passrc/src/pasresolveeval.pas

@@ -564,6 +564,7 @@ type
   TResEvalString = class(TResEvalValue)
   TResEvalString = class(TResEvalValue)
   public
   public
     S: RawByteString;
     S: RawByteString;
+    OnlyASCII: boolean;
     constructor Create; override;
     constructor Create; override;
     constructor CreateValue(const aValue: RawByteString);
     constructor CreateValue(const aValue: RawByteString);
     function Clone: TResEvalValue; override;
     function Clone: TResEvalValue; override;
@@ -692,7 +693,8 @@ type
   private
   private
     FAllowedInts: TResEvalTypedInts;
     FAllowedInts: TResEvalTypedInts;
     {$ifdef FPC_HAS_CPSTRING}
     {$ifdef FPC_HAS_CPSTRING}
-    FDefaultEncoding: TSystemCodePage;
+    FDefaultSourceEncoding: TSystemCodePage;
+    FDefaultStringEncoding: TSystemCodePage;
     {$endif}
     {$endif}
     FOnEvalIdentifier: TPasResEvalIdentHandler;
     FOnEvalIdentifier: TPasResEvalIdentHandler;
     FOnEvalParams: TPasResEvalParamsHandler;
     FOnEvalParams: TPasResEvalParamsHandler;
@@ -779,6 +781,8 @@ type
     function GetUTF8Str(const s: RawByteString; ErrorEl: TPasElement): String;
     function GetUTF8Str(const s: RawByteString; ErrorEl: TPasElement): String;
     function GetUnicodeStr(const s: RawByteString; ErrorEl: TPasElement): UnicodeString;
     function GetUnicodeStr(const s: RawByteString; ErrorEl: TPasElement): UnicodeString;
     function GetWideChar(const s: RawByteString; out w: WideChar): boolean;
     function GetWideChar(const s: RawByteString; out w: WideChar): boolean;
+    function GetExprStringTargetCP(Expr: TPasExpr): TSystemCodePage; virtual; // e.g. var s: String(1234) = 'ä' return 1234
+    function GetExprStringSourceCP(Expr: TPasExpr): TSystemCodePage; virtual; // e.g. {$codepage 123}var s: String = 'ä' return 123
     {$endif}
     {$endif}
     property OnLog: TPasResEvalLogHandler read FOnLog write FOnLog;
     property OnLog: TPasResEvalLogHandler read FOnLog write FOnLog;
     property OnEvalIdentifier: TPasResEvalIdentHandler read FOnEvalIdentifier write FOnEvalIdentifier;
     property OnEvalIdentifier: TPasResEvalIdentHandler read FOnEvalIdentifier write FOnEvalIdentifier;
@@ -786,7 +790,8 @@ type
     property OnRangeCheckEl: TPasResEvalRangeCheckElHandler read FOnRangeCheckEl write FOnRangeCheckEl;
     property OnRangeCheckEl: TPasResEvalRangeCheckElHandler read FOnRangeCheckEl write FOnRangeCheckEl;
     property AllowedInts: TResEvalTypedInts read FAllowedInts write FAllowedInts;
     property AllowedInts: TResEvalTypedInts read FAllowedInts write FAllowedInts;
     {$ifdef FPC_HAS_CPSTRING}
     {$ifdef FPC_HAS_CPSTRING}
-    property DefaultStringCodePage: TSystemCodePage read FDefaultEncoding write FDefaultEncoding;
+    property DefaultSourceCodePage: TSystemCodePage read FDefaultSourceEncoding write FDefaultSourceEncoding;
+    property DefaultStringCodePage: TSystemCodePage read FDefaultStringEncoding write FDefaultStringEncoding;
     {$endif}
     {$endif}
   end;
   end;
   TResExprEvaluatorClass = class of TResExprEvaluator;
   TResExprEvaluatorClass = class of TResExprEvaluator;
@@ -4126,15 +4131,22 @@ end;
 
 
 function TResExprEvaluator.EvalPrimitiveExprString(Expr: TPrimitiveExpr
 function TResExprEvaluator.EvalPrimitiveExprString(Expr: TPrimitiveExpr
   ): TResEvalValue;
   ): TResEvalValue;
-{ Extracts the value from a Pascal string literal
-
-  S is a Pascal string literal e.g. 'Line'#10
-    ''  empty string
-    '''' => "'"
-    #decimal
-    #$hex
-    ^l  l is a letter a-z
-}
+ //Extracts the value from a Pascal string literal
+ //
+ // S is a Pascal string literal e.g. 'Line'#10
+ //   ''  empty string
+ //   '''' => "'"
+ //   #decimal
+ //   #$hex
+ //   ^l  l is a letter a-z
+ //
+ // Codepage:
+ //   For example {$codepage utf8}var s: AnsiString(CP_1251) = 'a';
+ //     Source codepage is CP_UTF8, target codepage is CP_1251
+ //
+ //   Source codepage is needed for reading non ASCII string literals 'ä'.
+ //   Target codepage is needed for reading non ASCII # literals.
+ //   Target codepage costs time to compute.
 
 
   procedure RangeError(id: TMaxPrecInt);
   procedure RangeError(id: TMaxPrecInt);
   begin
   begin
@@ -4142,24 +4154,36 @@ function TResExprEvaluator.EvalPrimitiveExprString(Expr: TPrimitiveExpr
     RaiseRangeCheck(id,Expr);
     RaiseRangeCheck(id,Expr);
   end;
   end;
 
 
-  procedure Add(h: String);
+{$IFDEF FPC_HAS_CPSTRING}
+var
+  TargetCPValid: boolean;
+  TargetCP: word;
+  SourceCPValid: boolean;
+  SourceCP: word;
+
+  procedure FetchSourceCP;
   begin
   begin
-    {$ifdef FPC_HAS_CPSTRING}
-    if Result.Kind=revkString then
-      TResEvalString(Result).S:=TResEvalString(Result).S+h
-    else
-      TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+GetUnicodeStr(h,Expr);
-    {$else}
-    TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+h;
-    {$endif}
+    if SourceCPValid then exit;
+    SourceCP:=GetExprStringSourceCP(Expr);
+    if SourceCP=DefaultSystemCodePage then
+      SourceCP:=CP_ACP;
+    SourceCPValid:=true;
   end;
   end;
 
 
-  procedure AddHash(u: longword; ForceUTF16: boolean);
-  {$ifdef FPC_HAS_CPSTRING}
+  procedure FetchTargetCP;
+  begin
+    if TargetCPValid then exit;
+    TargetCP:=GetExprStringTargetCP(Expr);
+    if TargetCP=DefaultSystemCodePage then
+      TargetCP:=CP_ACP;
+    TargetCPValid:=true;
+  end;
+
+  procedure ForceUTF16;
   var
   var
     h: RawByteString;
     h: RawByteString;
   begin
   begin
-    if ((u>255) or ForceUTF16) and (Result.Kind=revkString) then
+    if Result.Kind=revkString then
       begin
       begin
       // switch to unicodestring
       // switch to unicodestring
       h:=TResEvalString(Result).S;
       h:=TResEvalString(Result).S;
@@ -4167,22 +4191,196 @@ function TResExprEvaluator.EvalPrimitiveExprString(Expr: TPrimitiveExpr
       Result:=nil; // in case of exception in GetUnicodeStr
       Result:=nil; // in case of exception in GetUnicodeStr
       Result:=TResEvalUTF16.CreateValue(GetUnicodeStr(h,Expr));
       Result:=TResEvalUTF16.CreateValue(GetUnicodeStr(h,Expr));
       end;
       end;
+  end;
+{$ENDIF}
+
+  procedure AddSrc(h: String);
+  {$ifdef FPC_HAS_CPSTRING}
+  var
+    Value: TResEvalString;
+    OnlyASCII: Boolean;
+    i: Integer;
+  {$ENDIF}
+  begin
+    if h='' then exit;
+    {$ifdef FPC_HAS_CPSTRING}
+    OnlyASCII:=true;
+    for i:=1 to length(h) do
+      if ord(h[i])>127 then
+        begin
+        // append non ASCII -> needs codepage
+        OnlyASCII:=false;
+        FetchSourceCP;
+        SetCodePage(rawbytestring(h),SourceCP,false);
+        break;
+        end;
+
     if Result.Kind=revkString then
     if Result.Kind=revkString then
-      TResEvalString(Result).S:=TResEvalString(Result).S+Chr(u)
+      begin
+      Value:=TResEvalString(Result);
+      if OnlyASCII and Value.OnlyASCII then
+        begin
+        // concatenate ascii strings
+        Value.S:=Value.S+h;
+        exit;
+        end;
+
+      // concatenate non ascii strings
+      FetchTargetCP;
+      case TargetCP of
+      CP_UTF16:
+        begin
+        ForceUTF16;
+        TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+GetUnicodeStr(h,Expr);
+        //writeln('AddSrc len(h)=',length(h),' StringCodePage=',StringCodePage(h),' GetCodePage=',GetCodePage(h),' S=',length(TResEvalUTF16(Result).S));
+        end;
+      CP_UTF16BE:
+        RaiseNotYetImplemented(20201220222608,Expr);
+      else
+        begin
+        if Value.OnlyASCII and (Value.S<>'') then
+          SetCodePage(Value.S,TargetCP,false);
+        Value.S:=Value.S+h;
+        end;
+      end;
+
+      end
+    else
+      TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+GetUnicodeStr(h,Expr);
+    {$else}
+    TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+h;
+    {$endif}
+  end;
+
+  procedure AddHash(u: longword);
+  {$ifdef FPC_HAS_CPSTRING}
+  begin
+    if Result.Kind=revkString then
+      TResEvalString(Result).s:=TResEvalString(Result).S+Chr(u)
     else
     else
       TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+WideChar(u);
       TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+WideChar(u);
   end;
   end;
   {$else}
   {$else}
   begin
   begin
     TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+WideChar(u);
     TResEvalUTF16(Result).S:=TResEvalUTF16(Result).S+WideChar(u);
-    if ForceUTF16 then ;
   end;
   end;
   {$endif}
   {$endif}
 
 
+  function ReadHash(Value: TResEvalValue; const S: string; p, l: integer): integer;
+  var
+    StartP: Integer;
+    u: longword;
+    c: Char;
+    {$ifdef FPC_HAS_CPSTRING}
+    ValueAnsi: TResEvalString;
+    ValueUTF16: TResEvalUTF16;
+    OldCP: TSystemCodePage;
+    {$ENDIF}
+  begin
+    Result:=p;
+    inc(Result);
+    if Result>l then
+      RaiseInternalError(20181016121354); // error in scanner
+    if S[Result]='$' then
+      begin
+      // #$hexnumber
+      inc(Result);
+      StartP:=Result;
+      u:=0;
+      while Result<=l do
+        begin
+        c:=S[Result];
+        case c of
+        '0'..'9': u:=u*16+longword(ord(c)-ord('0'));
+        'a'..'f': u:=u*16+longword(ord(c)-ord('a'))+10;
+        'A'..'F': u:=u*16+longword(ord(c)-ord('A'))+10;
+        else break;
+        end;
+        if u>$10FFFF then
+          RangeError(20170523115712);
+        inc(Result);
+        end;
+      end
+    else
+      begin
+      // #decimalnumber
+      StartP:=Result;
+      u:=0;
+      while Result<=l do
+        begin
+        c:=S[Result];
+        case c of
+        '0'..'9': u:=u*10+longword(ord(c)-ord('0'));
+        else break;
+        end;
+        if u>$ffff then
+          RangeError(20170523123137);
+        inc(Result);
+        end;
+      end;
+    if Result=StartP then
+      RaiseInternalError(20170523123806);
+    {$IFDEF FPC_HAS_CPSTRING}
+    if u<128 then
+      begin
+      // ASCII
+      AddHash(u);
+      exit;
+      end;
+    // non ASCII
+    FetchTargetCP;
+    if (TargetCP=CP_UTF16) or (TargetCP=CP_UTF16BE) or (u>255) then
+      begin
+      ForceUTF16;
+      ValueUTF16:=TResEvalUTF16(Value);
+      if u>$ffff then
+        begin
+        // split into two
+        dec(u,$10000);
+        ValueUTF16.S:=ValueUTF16.S+WideChar($D800+(u shr 10));
+        ValueUTF16.S:=ValueUTF16.S+WideChar($DC00+(u and $3ff));
+        end
+      else
+        ValueUTF16.S:=ValueUTF16.S+WideChar(u);
+      if TargetCP=CP_UTF16BE then
+        RaiseNotYetImplemented(20201220212206,Expr);
+      end
+    else
+      begin
+      // byte encoding
+      ValueAnsi:=TResEvalString(Value);
+      if ValueAnsi.S<>'' then
+        begin
+        // append
+        OldCP:=StringCodePage(ValueAnsi.S);
+        if OldCP<>TargetCP then
+          SetCodePage(ValueAnsi.S,TargetCP,false);
+        ValueAnsi.S:=ValueAnsi.S+Chr(u);
+        end
+      else
+        begin
+        // start
+        ValueAnsi.S:=Chr(u);
+        SetCodePage(ValueAnsi.S,TargetCP,false);
+        end;
+      ValueAnsi.OnlyASCII:=false;
+      end;
+    {$ELSE}
+    if u>$ffff then
+      begin
+      // split into two
+      dec(u,$10000);
+      AddHash($D800+(u shr 10));
+      AddHash($DC00+(u and $3ff));
+      end
+    else
+      AddHash(u);
+    {$ENDIF}
+  end;
+
 var
 var
   p, StartP, l: integer;
   p, StartP, l: integer;
   c: Char;
   c: Char;
-  u: longword;
   S: String;
   S: String;
 begin
 begin
   Result:=nil;
   Result:=nil;
@@ -4194,6 +4392,10 @@ begin
   if l=0 then
   if l=0 then
     RaiseInternalError(20170523113809);
     RaiseInternalError(20170523113809);
   {$ifdef FPC_HAS_CPSTRING}
   {$ifdef FPC_HAS_CPSTRING}
+  TargetCPValid:=false;
+  TargetCP:=CP_ACP;
+  SourceCPValid:=false;
+  SourceCP:=CP_ACP;
   Result:=TResEvalString.Create;
   Result:=TResEvalString.Create;
   {$else}
   {$else}
   Result:=TResEvalUTF16.Create;
   Result:=TResEvalUTF16.Create;
@@ -4216,12 +4418,12 @@ begin
         '''':
         '''':
           begin
           begin
           if p>StartP then
           if p>StartP then
-            Add(copy(S,StartP,p-StartP));
+            AddSrc(copy(S,StartP,p-StartP));
           inc(p);
           inc(p);
           StartP:=p;
           StartP:=p;
           if (p>l) or (S[p]<>'''') then
           if (p>l) or (S[p]<>'''') then
             break;
             break;
-          Add('''');
+          AddSrc('''');
           inc(p);
           inc(p);
           StartP:=p;
           StartP:=p;
           end;
           end;
@@ -4230,65 +4432,10 @@ begin
         end;
         end;
       until false;
       until false;
       if p>StartP then
       if p>StartP then
-        Add(copy(S,StartP,p-StartP));
+        AddSrc(copy(S,StartP,p-StartP));
       end;
       end;
     '#':
     '#':
-      begin
-      inc(p);
-      if p>l then
-        RaiseInternalError(20181016121354);
-      if S[p]='$' then
-        begin
-        // #$hexnumber
-        inc(p);
-        StartP:=p;
-        u:=0;
-        while p<=l do
-          begin
-          c:=S[p];
-          case c of
-          '0'..'9': u:=u*16+longword(ord(c)-ord('0'));
-          'a'..'f': u:=u*16+longword(ord(c)-ord('a'))+10;
-          'A'..'F': u:=u*16+longword(ord(c)-ord('A'))+10;
-          else break;
-          end;
-          if u>$10FFFF then
-            RangeError(20170523115712);
-          inc(p);
-          end;
-        if p=StartP then
-          RaiseInternalError(20170207164956);
-        if u>$ffff then
-          begin
-          // split into two
-          dec(u,$10000);
-          AddHash($D800+(u shr 10),true);
-          AddHash($DC00+(u and $3ff),true);
-          end
-        else
-          AddHash(u,p-StartP>2);
-        end
-      else
-        begin
-        // #decimalnumber
-        StartP:=p;
-        u:=0;
-        while p<=l do
-          begin
-          c:=S[p];
-          case c of
-          '0'..'9': u:=u*10+longword(ord(c)-ord('0'));
-          else break;
-          end;
-          if u>$ffff then
-            RangeError(20170523123137);
-          inc(p);
-          end;
-        if p=StartP then
-          RaiseInternalError(20170523123806);
-        AddHash(u,(S[StartP]='0') and (u>0));
-        end;
-      end;
+      p:=ReadHash(Result,S,p,l);
     '^':
     '^':
       begin
       begin
       // ^A is #1
       // ^A is #1
@@ -4297,8 +4444,8 @@ begin
         RaiseInternalError(20181016121520);
         RaiseInternalError(20181016121520);
       c:=S[p];
       c:=S[p];
       case c of
       case c of
-      'a'..'z': AddHash(ord(c)-ord('a')+1,false);
-      'A'..'Z': AddHash(ord(c)-ord('A')+1,false);
+      'a'..'z': AddHash(ord(c)-ord('a')+1);
+      'A'..'Z': AddHash(ord(c)-ord('A')+1);
       else RaiseInternalError(20170523123809);
       else RaiseInternalError(20170523123809);
       end;
       end;
       inc(p);
       inc(p);
@@ -4324,7 +4471,8 @@ begin
   inherited Create;
   inherited Create;
   FAllowedInts:=ReitDefaults;
   FAllowedInts:=ReitDefaults;
   {$ifdef FPC_HAS_CPSTRING}
   {$ifdef FPC_HAS_CPSTRING}
-  FDefaultEncoding:=CP_ACP;
+  FDefaultSourceEncoding:=system.DefaultSystemCodePage;
+  FDefaultStringEncoding:=CP_ACP;
   {$endif}
   {$endif}
 end;
 end;
 
 
@@ -5116,11 +5264,11 @@ end;
 
 
 function TResExprEvaluator.GetCodePage(const s: RawByteString): TSystemCodePage;
 function TResExprEvaluator.GetCodePage(const s: RawByteString): TSystemCodePage;
 begin
 begin
-  if s='' then exit(DefaultStringCodePage);
+  if s='' then exit(DefaultSourceCodePage);
   Result:=StringCodePage(s);
   Result:=StringCodePage(s);
   if (Result=CP_ACP) or (Result=CP_NONE) then
   if (Result=CP_ACP) or (Result=CP_NONE) then
     begin
     begin
-    Result:=DefaultStringCodePage;
+    Result:=DefaultSourceCodePage;
     if (Result=CP_ACP) or (Result=CP_NONE) then
     if (Result=CP_ACP) or (Result=CP_NONE) then
       begin
       begin
       Result:=System.DefaultSystemCodePage;
       Result:=System.DefaultSystemCodePage;
@@ -5182,7 +5330,7 @@ var
 begin
 begin
   if s='' then exit('');
   if s='' then exit('');
   CP:=GetCodePage(s);
   CP:=GetCodePage(s);
-  if CP=CP_UTF8 then
+  if (CP=CP_UTF8) or ((CP=CP_ACP) and (DefaultSystemCodePage=CP_UTF8)) then
     begin
     begin
     if ErrorEl<>nil then
     if ErrorEl<>nil then
       CheckValidUTF8(s,ErrorEl);
       CheckValidUTF8(s,ErrorEl);
@@ -5217,6 +5365,20 @@ begin
     Result:=true;
     Result:=true;
     end;
     end;
 end;
 end;
+
+function TResExprEvaluator.GetExprStringTargetCP(Expr: TPasExpr
+  ): TSystemCodePage;
+begin
+  Result:=DefaultStringCodePage;
+  if Expr=nil then ;
+end;
+
+function TResExprEvaluator.GetExprStringSourceCP(Expr: TPasExpr
+  ): TSystemCodePage;
+begin
+  Result:=DefaultSourceCodePage;
+  if Expr=nil then ;
+end;
 {$endif}
 {$endif}
 
 
 procedure TResExprEvaluator.PredBool(Value: TResEvalBool; ErrorEl: TPasElement);
 procedure TResExprEvaluator.PredBool(Value: TResEvalBool; ErrorEl: TPasElement);
@@ -5565,6 +5727,7 @@ end;
 constructor TResEvalString.Create;
 constructor TResEvalString.Create;
 begin
 begin
   inherited Create;
   inherited Create;
+  OnlyASCII:=true;
   Kind:=revkString;
   Kind:=revkString;
 end;
 end;
 
 
@@ -5578,6 +5741,7 @@ function TResEvalString.Clone: TResEvalValue;
 begin
 begin
   Result:=inherited Clone;
   Result:=inherited Clone;
   TResEvalString(Result).S:=S;
   TResEvalString(Result).S:=S;
+  TResEvalString(Result).OnlyASCII:=OnlyASCII;
 end;
 end;
 
 
 function TResEvalString.AsString: string;
 function TResEvalString.AsString: string;

+ 1 - 1
packages/fcl-passrc/src/pasresolver.pp

@@ -15704,7 +15704,7 @@ begin
       end;
       end;
     {$endif}
     {$endif}
     revkUnicodeString:
     revkUnicodeString:
-      if length(TResEvalUTF16(Value).S)=1 then
+      if (length(TResEvalUTF16(Value).S)=1) and (bt in btAllChars) then
         begin
         begin
         w:=TResEvalUTF16(Value).S[1];
         w:=TResEvalUTF16(Value).S[1];
         {$ifdef FPC_HAS_CPSTRING}
         {$ifdef FPC_HAS_CPSTRING}

+ 4 - 3
packages/fcl-passrc/tests/tcresolver.pas

@@ -299,7 +299,7 @@ type
     Procedure TestIntegerBoolFail;
     Procedure TestIntegerBoolFail;
     Procedure TestBooleanOperators;
     Procedure TestBooleanOperators;
     Procedure TestStringOperators;
     Procedure TestStringOperators;
-    Procedure TestWideCharOperators;
+    Procedure TestWideCharOperators_DelphiUnicode;
     Procedure TestFloatOperators;
     Procedure TestFloatOperators;
     Procedure TestCAssignments;
     Procedure TestCAssignments;
     Procedure TestTypeCastBaseTypes;
     Procedure TestTypeCastBaseTypes;
@@ -2181,6 +2181,7 @@ begin
   Result.OnFindUnit:=@OnPasResolverFindUnit;
   Result.OnFindUnit:=@OnPasResolverFindUnit;
   Result.OnLog:=@OnPasResolverLog;
   Result.OnLog:=@OnPasResolverLog;
   Result.Hub:=Hub;
   Result.Hub:=Hub;
+  Result.ExprEvaluator.DefaultSourceCodePage:=CP_UTF8;
   FModules.Add(Result);
   FModules.Add(Result);
 end;
 end;
 
 
@@ -4678,9 +4679,9 @@ begin
   ParseProgram;
   ParseProgram;
 end;
 end;
 
 
-procedure TTestResolver.TestWideCharOperators;
+procedure TTestResolver.TestWideCharOperators_DelphiUnicode;
 begin
 begin
-  ResolverEngine.ExprEvaluator.DefaultStringCodePage:=CP_UTF8;
+  ResolverEngine.ExprEvaluator.DefaultStringCodePage:=CP_UTF16;
   ResolverEngine.BaseTypeChar:=btWideChar;
   ResolverEngine.BaseTypeChar:=btWideChar;
   ResolverEngine.BaseTypeString:=btUnicodeString;
   ResolverEngine.BaseTypeString:=btUnicodeString;
   StartProgram(false);
   StartProgram(false);