Browse Source

pastojs: fixed UTF-16 chars

git-svn-id: trunk@47831 -
Mattias Gaertner 4 years ago
parent
commit
f0961c17c8
2 changed files with 79 additions and 10 deletions
  1. 7 7
      packages/pastojs/src/fppas2js.pp
  2. 72 3
      packages/pastojs/tests/tcmodules.pas

+ 7 - 7
packages/pastojs/src/fppas2js.pp

@@ -6198,7 +6198,8 @@ begin
   cInterfaceToString:=cTypeConversion+1;
   cInterfaceToString:=cTypeConversion+1;
 
 
   {$IFDEF FPC_HAS_CPSTRING}
   {$IFDEF FPC_HAS_CPSTRING}
-  ExprEvaluator.DefaultStringCodePage:=CP_UTF8;
+  ExprEvaluator.DefaultSourceCodePage:=CP_UTF8;
+  ExprEvaluator.DefaultStringCodePage:=CP_UTF16;
   {$ENDIF}
   {$ENDIF}
   FExternalNames:=TPasResHashList.Create;
   FExternalNames:=TPasResHashList.Create;
   StoreSrcColumns:=true;
   StoreSrcColumns:=true;
@@ -6514,10 +6515,10 @@ function TPas2JSResolver.ExtractPasStringLiteral(El: TPasElement;
   S is a Pascal string literal e.g. 'Line'#10
   S is a Pascal string literal e.g. 'Line'#10
     ''  empty string
     ''  empty string
     '''' => "'"
     '''' => "'"
-    #decimal   #0..255 is UTF-8 byte, #01..0255 is UTF-16, #256+ is UTF-16
-    #$hex      #$0..$ff is UTF-8 byte, #$01..$0FF is UTF-16, #$100+ is UTF-16
+    #decimal
+    #$hex
     ^l  l is a letter a-z
     ^l  l is a letter a-z
-    Invalid UTF-8 sequences give an error
+    Note that invalid UTF-8 sequences are checked by the scanner
 }
 }
 var
 var
   p, StartP, i, l: integer;
   p, StartP, i, l: integer;
@@ -6563,7 +6564,7 @@ begin
       end;
       end;
     '#':
     '#':
       begin
       begin
-      // byte or word sequence
+      // word sequence
       inc(p);
       inc(p);
       if p>l then
       if p>l then
         RaiseInternalError(20170207155121);
         RaiseInternalError(20170207155121);
@@ -6588,7 +6589,6 @@ begin
           end;
           end;
         if p=StartP then
         if p=StartP then
           RaiseInternalError(20170207164956);
           RaiseInternalError(20170207164956);
-        Result:=Result+CodePointToJSString(i);
         end
         end
       else
       else
         begin
         begin
@@ -6608,8 +6608,8 @@ begin
           end;
           end;
         if p=StartP then
         if p=StartP then
           RaiseInternalError(20170207171148);
           RaiseInternalError(20170207171148);
-        Result:=Result+CodePointToJSString(i);
         end;
         end;
+      Result:=Result+CodePointToJSString(i);
       end;
       end;
     '^':
     '^':
       begin
       begin

+ 72 - 3
packages/pastojs/tests/tcmodules.pas

@@ -282,6 +282,7 @@ type
     Procedure TestChar_Compare;
     Procedure TestChar_Compare;
     Procedure TestChar_BuiltInProcs;
     Procedure TestChar_BuiltInProcs;
     Procedure TestStringConst;
     Procedure TestStringConst;
+    Procedure TestStringConst_InvalidUTF16;
     Procedure TestStringConstSurrogate;
     Procedure TestStringConstSurrogate;
     Procedure TestString_Length;
     Procedure TestString_Length;
     Procedure TestString_Compare;
     Procedure TestString_Compare;
@@ -983,6 +984,28 @@ var
     end;
     end;
   end;
   end;
 
 
+  function HasSpecialChar(s: string): boolean;
+  var
+    i: Integer;
+  begin
+    for i:=1 to length(s) do
+      if s[i] in [#0..#31,#127..#255] then
+        exit(true);
+    Result:=false;
+  end;
+
+  function HashSpecialChars(s: string): string;
+  var
+    i: Integer;
+  begin
+    Result:='';
+    for i:=1 to length(s) do
+      if s[i] in [#0..#31,#127..#255] then
+        Result:=Result+'#'+hexstr(ord(s[i]),2)
+      else
+        Result:=Result+s[i];
+  end;
+
   procedure DiffFound;
   procedure DiffFound;
   var
   var
     ActLineStartP, ActLineEndP, p, StartPos: PChar;
     ActLineStartP, ActLineEndP, p, StartPos: PChar;
@@ -1011,8 +1034,12 @@ var
         ActLineEndP:=FindLineEnd(ActualP);
         ActLineEndP:=FindLineEnd(ActualP);
         ActLine:=copy(Actual,ActLineStartP-PChar(Actual)+1,ActLineEndP-ActLineStartP);
         ActLine:=copy(Actual,ActLineStartP-PChar(Actual)+1,ActLineEndP-ActLineStartP);
         writeln('- ',ActLine);
         writeln('- ',ActLine);
+        if HasSpecialChar(ActLine) then
+          writeln('- ',HashSpecialChars(ActLine));
         // write expected line
         // write expected line
         writeln('+ ',ExpLine);
         writeln('+ ',ExpLine);
+        if HasSpecialChar(ExpLine) then
+          writeln('- ',HashSpecialChars(ExpLine));
         // write empty line with pointer ^
         // write empty line with pointer ^
         for i:=1 to 2+ExpectedP-StartPos do write(' ');
         for i:=1 to 2+ExpectedP-StartPos do write(' ');
         writeln('^');
         writeln('^');
@@ -7609,11 +7636,16 @@ begin
   '  s:=''"''''"'';',
   '  s:=''"''''"'';',
   '  s:=#$20AC;', // euro
   '  s:=#$20AC;', // euro
   '  s:=#$10437;', // outside BMP
   '  s:=#$10437;', // outside BMP
-  //'  s:=#$F0#$90#$90#$B7;', // as UTF-8
+  '  s:=''abc''#$20AC;', // ascii,#
+  '  s:=''ä''#$20AC;', // non ascii,#
+  '  s:=#$20AC''abc'';', // #, ascii
+  '  s:=#$20AC''ä'';', // #, non ascii
   '  s:=default(string);',
   '  s:=default(string);',
   '  s:=concat(s);',
   '  s:=concat(s);',
   '  s:=concat(s,''a'',s);',
   '  s:=concat(s,''a'',s);',
-  //'  s:=#0250#269;',
+  '  s:=#250#269;',
+  //'  s:=#$2F804;',
+  // ToDo: \uD87E\uDC04 -> \u{2F804}
   '']);
   '']);
   ConvertProgram;
   ConvertProgram;
   CheckSource('TestStringConst',
   CheckSource('TestStringConst',
@@ -7635,10 +7667,47 @@ begin
     '$mod.s=''"\''"'';',
     '$mod.s=''"\''"'';',
     '$mod.s="€";',
     '$mod.s="€";',
     '$mod.s="'#$F0#$90#$90#$B7'";',
     '$mod.s="'#$F0#$90#$90#$B7'";',
-    //'$mod.s="'#$F0#$90#$90#$B7'";',
+    '$mod.s = "abc€";',
+    '$mod.s = "ä€";',
+    '$mod.s = "€abc";',
+    '$mod.s = "ۊ";',
     '$mod.s="";',
     '$mod.s="";',
     '$mod.s = $mod.s;',
     '$mod.s = $mod.s;',
     '$mod.s = $mod.s.concat("a", $mod.s);',
     '$mod.s = $mod.s.concat("a", $mod.s);',
+    '$mod.s = "úč";',
+    '']));
+end;
+
+procedure TTestModule.TestStringConst_InvalidUTF16;
+begin
+  StartProgram(false);
+  Add([
+  'const',
+  '  a: char = #$D87E;',
+  '  b: string = #$D87E;',
+  '  c: string = #$D87E#43;',
+  'begin',
+  '  c:=''abc''#$D87E;',
+  '  c:=#0#1#2;',
+  '  c:=#127;',
+  '  c:=#128;',
+  '  c:=#255;',
+  '  c:=#256;',
+  '']);
+  ConvertProgram;
+  CheckSource('TestStringConst',
+    LinesToStr([
+    'this.a = "\uD87E";',
+    'this.b = "\uD87E";',
+    'this.c = "\uD87E+";',
+    '']),
+    LinesToStr([
+    '$mod.c = "abc\uD87E";',
+    '$mod.c = "\x00\x01\x02";',
+    '$mod.c = "'#127'";',
+    '$mod.c = "'#$c2#$80'";',
+    '$mod.c = "'#$c3#$BF'";',
+    '$mod.c = "'#$c4#$80'";',
     '']));
     '']));
 end;
 end;