ソースを参照

* made utf8tostring() Delphi-compatible (mantis #29585):
o removed utf8string overload
o always ignore any code page information from the input, and interpret the
contents of the input directly as utf8-encoded bytes
* made utf8tostring() compatible with the JVM backend (mantis #29497)

git-svn-id: trunk@33159 -

Jonas Maebe 9 年 前
コミット
a100309350

+ 2 - 0
.gitattributes

@@ -11635,6 +11635,7 @@ tests/test/jvm/tvirtclmeth.pp svneol=native#text/plain
 tests/test/jvm/tw20212.pp svneol=native#text/plain
 tests/test/jvm/tw22807.pp svneol=native#text/plain
 tests/test/jvm/tw24089.pp svneol=native#text/plain
+tests/test/jvm/tw29585.pp svneol=native#text/plain
 tests/test/jvm/twith.pp svneol=native#text/plain
 tests/test/jvm/uenum.pp svneol=native#text/plain
 tests/test/jvm/ujsetter.pp svneol=native#text/plain
@@ -14939,6 +14940,7 @@ tests/webtbs/tw2953.pp svneol=native#text/plain
 tests/webtbs/tw29546.pp svneol=native#text/pascal
 tests/webtbs/tw2956.pp svneol=native#text/plain
 tests/webtbs/tw2958.pp svneol=native#text/plain
+tests/webtbs/tw29585.pp svneol=native#text/plain
 tests/webtbs/tw29609.pp svneol=native#text/pascal
 tests/webtbs/tw2966.pp svneol=native#text/plain
 tests/webtbs/tw2975.pp svneol=native#text/plain

+ 5 - 2
rtl/inc/ustringh.inc

@@ -136,12 +136,15 @@ function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar
 function UTF8Encode(const s : RawByteString) : RawByteString; inline;
 function UTF8Encode(const s : UnicodeString) : RawByteString;
 function UTF8Decode(const s : RawByteString): UnicodeString;
-function UTF8ToString(const s : UTF8String): UnicodeString;inline;
 function UTF8ToString(const s : RawByteString): UnicodeString;inline;
 function UTF8ToString(const S: ShortString): unicodestring;
 function UTF8ToString(const S: PAnsiChar): unicodestring;
+{ byte and ansichar are the same on the JVM, and "array of" and "pointer to"
+  are as well }
+{$ifndef CPUJVM}
 function UTF8ToString(const S: array of AnsiChar): unicodestring;
-function UTF8ToString(const S: array of Byte): unicodestring; 
+function UTF8ToString(const S: array of Byte): unicodestring;
+{$endif not CPUJVM}
 function AnsiToUtf8(const s : RawByteString): RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif}
 function Utf8ToAnsi(const s : RawByteString) : RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif}
 function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;

+ 23 - 29
rtl/inc/ustrings.inc

@@ -2362,63 +2362,57 @@ Begin
   SetCodePage(Result,DefaultFileSystemCodePage,True);
 End;
 
-function UTF8ToString(const S: UTF8String): UnicodeString; inline;
-begin
-  Result := UTF8Decode(S);
-end;
-
+{ Delphi compatibility: always interpret the data in the string as UTF-8,
+  ignore any codepage }
 function UTF8ToString(const S: RawByteString): UnicodeString; inline;
-
-Var
-  UTF8 : UTF8String;
-
 begin
-  UTF8:=S;
-  Result := UTF8Decode(UTF8);
+  Result := UTF8Decode(S);
 end;
 
 function UTF8ToString(const S: ShortString): UnicodeString; 
-
 Var
-  UTF8 : UTF8String;
-
+  rs: RawByteString;
 begin
-  UTF8:=S;
-  Result := UTF8Decode(UTF8);
+  rs:=S;
+  Result := UTF8Decode(rs);
 end;
 
 function UTF8ToString(const S: PAnsiChar): UnicodeString;
 var
-  UTF: UTF8String;
+  rs: RawByteString;
   Count: Integer;
 begin
-  Count := StrLen(S);
-  SetLength(UTF, Count);
+  Count := length(S);
+  SetLength(rs, Count);
   if Count > 0 then
-    Move(S^, UTF[1], Count);
-  Result := UTF8ToString(UTF);
+    fpc_pchar_ansistr_intern_charmove(S,0,rs,0,Count);
+  Result := UTF8ToString(rs);
 end;
 
+{ byte and ansichar are the same on the JVM, and "array of" and "pointer to"
+  are as well }
+{$ifndef CPUJVM}
 function UTF8ToString(const S: array of AnsiChar): UnicodeString;
 var
-  UTF: UTF8String;
+  rs: RawByteString;
   Count: Integer;
 begin
   Count := Length(S);
-  SetLength(UTF, Count);
+  SetLength(rs, Count);
   if Count > 0 then
-    Move(S[Low(S)], UTF[1], Count);
-  Result := UTF8ToString(UTF);
+    fpc_pchar_ansistr_intern_charmove(@S,Low(S),rs,0,Count);
+  Result := UTF8ToString(rs);
 end;
 
 function UTF8ToString(const S: array of Byte): UnicodeString;
 var
-  UTF: UTF8String;
+  rs: RawByteString;
   Count: Integer;
 begin
   Count := Length(S);
-  SetLength(UTF, Count);
+  SetLength(rs, Count);
   if Count > 0 then
-    Move(S[Low(S)], UTF[1], Count);
-  Result := UTF8ToString(UTF);
+    fpc_pchar_ansistr_intern_charmove(pchar(@S),Low(S),rs,0,Count);
+  Result := UTF8ToString(rs);
 end;
+{$endif not CPUJVM}

+ 4 - 0
tests/test/jvm/testall.bat

@@ -324,3 +324,7 @@ ppcjvm -O2 -g -B  -CTinitlocals tprocvaranon
 if %errorlevel% neq 0 exit /b %errorlevel%
 java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tprocvaranon
 if %errorlevel% neq 0 exit /b %errorlevel%
+ppcjvm -O2 -g -B  -CTinitlocals tw29585
+if %errorlevel% neq 0 exit /b %errorlevel%
+java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tw29585
+if %errorlevel% neq 0 exit /b %errorlevel%

+ 2 - 0
tests/test/jvm/testall.sh

@@ -189,3 +189,5 @@ $PPC -O2 -g -B -Sa tformalclass
 java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tformalclass
 $PPC -O2 -g -B -Sa tprocvaranon
 java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tprocvaranon
+$PPC -O2 -g -B -Sa tw29585
+java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tw29585

+ 218 - 0
tests/test/jvm/tw29585.pp

@@ -0,0 +1,218 @@
+program tw29585;
+{$IFDEF FPC}
+{$MODE OBJFPC}{$H+}
+{$ENDIF}
+
+{$ifdef CPUJVM}
+uses
+  {$ifdef java}jdk15{$else}androidr14{$endif};
+
+  {$macro on}
+  {$define writeln:=jlsystem.fout.println}
+  {$define write:=jlsystem.fout.print}
+{$endif}
+
+{$IFNDEF FPC}
+type
+  tsystemcodepage = word;
+{$ENDIF}
+
+Type
+  tstr1251 = type ansistring(1251);
+
+const
+  utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82;
+  utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202;
+
+  invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83;
+  invalidutf8data_utf_16a: unicodestring = '????';
+  invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd;
+
+
+{ Returns the lowest "len" hexadecimal digits (lowercase) of "l", most
+  significant digit first. A "len" of zero or less yields an empty string. }
+function inttohex(l: longint; len: longint): unicodestring;
+const
+  hexchars: array[0..15] of ansichar = ('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
+var
+  remaining: longint;
+  nibble: longint;
+begin
+  result:='';
+  remaining:=len;
+  while remaining>0 do
+    begin
+      { peel off the lowest nibble and prepend its digit }
+      nibble:=l and $f;
+      result:=hexchars[nibble]+result;
+      l:=l shr 4;
+      dec(remaining);
+    end;
+end;
+
+{ Reports a failed check: prints the test number "l" and the string "u" as a
+  sequence of #$xxxx code units, then halts with "l" as the exit code. }
+procedure error(l: longint; const u: unicodestring);
+  var
+    idx: longint;
+    codeunit: longint;
+  begin
+    write('error for test ');
+    writeln(l);
+    write('result: ');
+    idx:=low(u);
+    while idx<=high(u) do
+      begin
+        codeunit:=ord(u[idx]);
+        write('#$');
+        write(inttohex(codeunit,4));
+        inc(idx);
+      end;
+    writeln;
+    halt(l);
+  end;
+
+
+{ Copies the ordinal value of every character in "data" to the byte at the
+  same index of the buffer "p" points to. }
+procedure initarray(p: pbyte; const data: array of ansichar);
+  var
+    idx: longint;
+  begin
+    { the element writes are independent, so the direction does not matter }
+    for idx:=high(data) downto low(data) do
+      p[idx]:=ord(data[idx]);
+  end;
+
+
+{ Sizes "s" to hold "data", tags it with code page "cp" without converting
+  its contents, and then copies the characters of "data" into it. }
+procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload;
+  var
+    n: longint;
+  begin
+    setlength(s,length(data));
+    setcodepage(s,cp,false);
+    { string positions are 1-based, open array indices are 0-based }
+    for n:=1 to length(data) do
+      s[n]:=data[n-1];
+  end;
+
+
+{ Sets the length of "s" to the number of characters in "data" and copies
+  them into it. }
+procedure initstr(var s: shortstring; const data: array of ansichar); overload;
+  var
+    n: longint;
+  begin
+    setlength(s,length(data));
+    { string positions are 1-based, open array indices are 0-based }
+    for n:=1 to length(data) do
+      s[n]:=data[n-1];
+  end;
+
+
+procedure testvalidutf8; { checks that each UTF8ToString overload turns the bytes of utf8data into utf8data_in_utf16 }
+  var
+    s1251: tstr1251;
+    rs: rawbytestring;
+    utf8: utf8string;
+    s: ansistring;
+    ss: shortstring;
+    ba: array[low(utf8data)..high(utf8data)] of byte;
+    bc: array[low(utf8data)..high(utf8data)] of ansichar;
+    bcc: array[low(utf8data)..high(utf8data)+1] of ansichar; { one extra element for the terminating #0 }
+    w: unicodestring;
+  begin
+    initstr(rawbytestring(s1251),1251,utf8data); { test 1: string tagged with code page 1251 }
+    w:=UTF8ToString(s1251);
+    if w<>utf8data_in_utf16 then
+      error(1,w);
+
+    initstr(rs,0,utf8data); { test 2: rawbytestring tagged with code page 0 }
+    w:=UTF8ToString(rs);
+    if w<>utf8data_in_utf16 then
+      error(2,w);
+
+    initstr(rawbytestring(utf8),CP_UTF8,utf8data); { test 3: utf8string tagged with CP_UTF8 }
+    w:=UTF8ToString(utf8);
+    if w<>utf8data_in_utf16 then
+      error(3,w);
+
+    initstr(rawbytestring(s),defaultsystemcodepage,utf8data); { test 4: ansistring tagged with defaultsystemcodepage }
+    w:=UTF8ToString(s);
+    if w<>utf8data_in_utf16 then
+      error(4,w);
+
+    initstr(ss,utf8data); { test 5: shortstring }
+    w:=UTF8ToString(ss);
+    if w<>utf8data_in_utf16 then
+      error(5,w);
+
+    initarray(@bcc[0],utf8data); { test 6: pointer to a #0-terminated character buffer }
+    bcc[high(bcc)]:=#0;
+    w:=UTF8ToString(@bcc[0]);
+    if w<>utf8data_in_utf16 then
+      error(6,w);
+
+{$ifndef cpujvm}
+    initarray(@ba[0],utf8data); { test 7: array of byte (compiled out for cpujvm) }
+    w:=UTF8ToString(ba);
+    if w<>utf8data_in_utf16 then
+      error(7,w);
+
+    initarray(@bc[0],utf8data); { test 8: array of ansichar (compiled out for cpujvm) }
+    w:=UTF8ToString(bc);
+    if w<>utf8data_in_utf16 then
+      error(8,w);
+{$endif not cpujvm}
+  end;
+
+
+procedure testinvalidutf8; { feeds invalidutf8data to each UTF8ToString overload; accepts either '????' or four #$fffd as the result }
+  var
+    s1251: tstr1251;
+    rs: rawbytestring;
+    utf8: utf8string;
+    s: ansistring;
+    ss: shortstring;
+    ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte;
+    bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar;
+    bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar; { one extra element for the terminating #0 }
+    w: unicodestring;
+  begin
+    initstr(rawbytestring(s1251),1251,invalidutf8data); { test 11: string tagged with code page 1251 }
+    w:=UTF8ToString(s1251);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(11,w);
+
+    initstr(rs,0,invalidutf8data); { test 12: rawbytestring tagged with code page 0 }
+    w:=UTF8ToString(rs);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(12,w);
+
+    initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data); { test 13: utf8string tagged with CP_UTF8 }
+    w:=UTF8ToString(utf8);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(13,w);
+
+    initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data); { test 14: ansistring tagged with defaultsystemcodepage }
+    w:=UTF8ToString(s);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(14,w);
+
+    initstr(ss,invalidutf8data); { test 15: shortstring }
+    w:=UTF8ToString(ss);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(15,w);
+
+    initarray(@bcc[0],invalidutf8data); { test 16: pointer to a #0-terminated character buffer }
+    bcc[high(bcc)]:=#0;
+    w:=UTF8ToString(@bcc[0]);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(16,w);
+
+{$ifndef cpujvm}
+    initarray(@ba[0],invalidutf8data); { test 17: array of byte (compiled out for cpujvm) }
+    w:=UTF8ToString(ba);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(17,w);
+
+    initarray(@bc[0],invalidutf8data); { test 18: array of ansichar (compiled out for cpujvm) }
+    w:=UTF8ToString(bc);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(18,w);
+{$endif not cpujvm}
+  end;
+
+
+begin { runs both test groups; error() halts with the failing test number }
+  testvalidutf8;
+  testinvalidutf8;
+end.

+ 196 - 0
tests/webtbs/tw29585.pp

@@ -0,0 +1,196 @@
+program tw29585;
+{$IFDEF FPC}
+{$MODE OBJFPC}{$H+}
+{$ELSE}
+{$APPTYPE Console}
+{$ENDIF}
+
+uses
+  {$ifndef FPC}Windows,{$endif}Sysutils;
+
+{$IFNDEF FPC}
+type
+  tsystemcodepage = word;
+{$ENDIF}
+
+Type
+  tstr1251 = type ansistring(1251);
+
+const
+  utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82;
+  utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202;
+
+  invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83;
+  invalidutf8data_utf_16a: unicodestring = '????';
+  invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd;
+
+
+{ Reports a failed check: prints the test number "l" and the string "u" as a
+  sequence of #$XXXX code units, then halts with "l" as the exit code. }
+procedure error(l: longint; const u: unicodestring);
+  var
+    i: longint;
+  begin
+    writeln('error for test ',l);
+    write('result: ');
+    { the elements of a unicodestring are 16 bit code units, so print each one
+      with four hex digits to get a fixed-width dump }
+    for i:=low(u) to high(u) do
+      write('#$',inttohex(ord(u[i]),4));
+    writeln;
+    halt(l);
+  end;
+
+
+{ Copies the ordinal value of every character in "data" to the byte at the
+  same index of the buffer "p" points to. }
+procedure initarray(p: pbyte; const data: array of ansichar);
+  var
+    idx: longint;
+  begin
+    { the element writes are independent, so the direction does not matter }
+    for idx:=high(data) downto low(data) do
+      p[idx]:=ord(data[idx]);
+  end;
+
+
+{ Sizes "s" to hold "data", tags it with code page "cp" without converting
+  its contents, and then copies the characters of "data" into it. }
+procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload;
+  var
+    n: longint;
+  begin
+    setlength(s,length(data));
+    setcodepage(s,cp,false);
+    { string positions are 1-based, open array indices are 0-based }
+    for n:=1 to length(data) do
+      s[n]:=data[n-1];
+  end;
+
+
+{ Sets the length of "s" to the number of characters in "data" and copies
+  them into it. }
+procedure initstr(var s: shortstring; const data: array of ansichar); overload;
+  var
+    n: longint;
+  begin
+    setlength(s,length(data));
+    { string positions are 1-based, open array indices are 0-based }
+    for n:=1 to length(data) do
+      s[n]:=data[n-1];
+  end;
+
+
+procedure testvalidutf8; { checks that each UTF8ToString overload turns the bytes of utf8data into utf8data_in_utf16 }
+  var
+    s1251: tstr1251;
+    rs: rawbytestring;
+    utf8: utf8string;
+    s: ansistring;
+    ss: shortstring;
+    ba: array[low(utf8data)..high(utf8data)] of byte;
+    bc: array[low(utf8data)..high(utf8data)] of ansichar;
+    bcc: array[low(utf8data)..high(utf8data)+1] of ansichar; { one extra element for the terminating #0 }
+    w: unicodestring;
+  begin
+    initstr(rawbytestring(s1251),1251,utf8data); { test 1: string tagged with code page 1251 }
+    w:=UTF8ToString(s1251);
+    if w<>utf8data_in_utf16 then
+      error(1,w);
+
+    initstr(rs,0,utf8data); { test 2: rawbytestring tagged with code page 0 }
+    w:=UTF8ToString(rs);
+    if w<>utf8data_in_utf16 then
+      error(2,w);
+
+    initstr(rawbytestring(utf8),CP_UTF8,utf8data); { test 3: utf8string tagged with CP_UTF8 }
+    w:=UTF8ToString(utf8);
+    if w<>utf8data_in_utf16 then
+      error(3,w);
+
+    initstr(rawbytestring(s),defaultsystemcodepage,utf8data); { test 4: ansistring tagged with defaultsystemcodepage }
+    w:=UTF8ToString(s);
+    if w<>utf8data_in_utf16 then
+      error(4,w);
+
+    initstr(ss,utf8data); { test 5: shortstring }
+    w:=UTF8ToString(ss);
+    if w<>utf8data_in_utf16 then
+      error(5,w);
+
+    initarray(@bcc[0],utf8data); { test 6: pointer to a #0-terminated character buffer }
+    bcc[high(bcc)]:=#0;
+    w:=UTF8ToString(@bcc[0]);
+    if w<>utf8data_in_utf16 then
+      error(6,w);
+
+{$ifndef cpujvm}
+    initarray(@ba[0],utf8data); { test 7: array of byte (compiled out for cpujvm) }
+    w:=UTF8ToString(ba);
+    if w<>utf8data_in_utf16 then
+      error(7,w);
+
+    initarray(@bc[0],utf8data); { test 8: array of ansichar (compiled out for cpujvm) }
+    w:=UTF8ToString(bc);
+    if w<>utf8data_in_utf16 then
+      error(8,w);
+{$endif not cpujvm}
+  end;
+
+
+procedure testinvalidutf8; { feeds invalidutf8data to each UTF8ToString overload; accepts either '????' or four #$fffd as the result }
+  var
+    s1251: tstr1251;
+    rs: rawbytestring;
+    utf8: utf8string;
+    s: ansistring;
+    ss: shortstring;
+    ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte;
+    bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar;
+    bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar; { one extra element for the terminating #0 }
+    w: unicodestring;
+  begin
+    initstr(rawbytestring(s1251),1251,invalidutf8data); { test 11: string tagged with code page 1251 }
+    w:=UTF8ToString(s1251);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(11,w);
+
+    initstr(rs,0,invalidutf8data); { test 12: rawbytestring tagged with code page 0 }
+    w:=UTF8ToString(rs);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(12,w);
+
+    initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data); { test 13: utf8string tagged with CP_UTF8 }
+    w:=UTF8ToString(utf8);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(13,w);
+
+    initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data); { test 14: ansistring tagged with defaultsystemcodepage }
+    w:=UTF8ToString(s);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(14,w);
+
+    initstr(ss,invalidutf8data); { test 15: shortstring }
+    w:=UTF8ToString(ss);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(15,w);
+
+    initarray(@bcc[0],invalidutf8data); { test 16: pointer to a #0-terminated character buffer }
+    bcc[high(bcc)]:=#0;
+    w:=UTF8ToString(@bcc[0]);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(16,w);
+
+{$ifndef cpujvm}
+    initarray(@ba[0],invalidutf8data); { test 17: array of byte (compiled out for cpujvm) }
+    w:=UTF8ToString(ba);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(17,w);
+
+    initarray(@bc[0],invalidutf8data); { test 18: array of ansichar (compiled out for cpujvm) }
+    w:=UTF8ToString(bc);
+    if (w<>invalidutf8data_utf_16a) and
+       (w<>invalidutf8data_utf_16b) then
+      error(18,w);
+{$endif not cpujvm}
+  end;
+
+
+begin { runs both test groups; error() halts with the failing test number }
+  testvalidutf8;
+  testinvalidutf8;
+end.