18 years ago · 2319d8c3ce
--- a/.gitattributes
+++ b/.gitattributes
@@ -7299,6 +7299,7 @@ tests/test/twide1.pp svneol=native#text/plain
 
															 tests/test/twide2.pp svneol=native#text/plain
														
 
															 tests/test/twide3.pp svneol=native#text/plain
														
 
															 tests/test/twide4.pp svneol=native#text/plain
														
 
															+tests/test/twide5.pp svneol=native#text/plain
														
 
															 tests/test/twrstr1.pp svneol=native#text/plain
														
 
															 tests/test/twrstr2.pp svneol=native#text/plain
														
 
															 tests/test/twrstr3.pp svneol=native#text/plain
														
--- a/rtl/inc/wustrings.inc
+++ b/rtl/inc/wustrings.inc
@@ -1814,24 +1814,111 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
 
															   end;
														
 
															+{ converts an utf-16 code point or surrogate pair to utf-32 }
														
 
															+function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char;
														
 
															+var
														
 
															+  w: widechar;
														
 
															+begin
														
 
															+  { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
														
 
															+  { are the same in UTF-32                                  }
														
 
															+  w:=s[index];
														
 
															+  if (w<=#$d7ff) or
														
 
															+     (w>=#$e000) then
														
 
															+    begin
														
 
															+      result:=UCS4Char(w);
														
 
															+      len:=1;
														
 
															+    end
														
 
															+  { valid surrogate pair? }
														
 
															+  else if (w<=#$dbff) and
														
 
															+          { w>=#$d7ff check not needed, checked above }
														
 
															+          (index<length(s)) and
														
 
															+          (s[index+1]>=#$dc00) and
														
 
															+          (s[index+1]<=#$dfff) then
														
 
															+      { convert the surrogate pair to UTF-32 }
														
 
															+    begin
														
 
															+      result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
														
 
															+      len:=2;
														
 
															+    end
														
 
															+  else
														
 
															+    { invalid surrogate -> do nothing }
														
 
															+    begin
														
 
															+      result:=UCS4Char(w);
														
 
															+      len:=1;
														
 
															+    end;
														
 
															+end;
														
 
															+
														
 
															+
														
 
															 function WideStringToUCS4String(const s : WideString) : UCS4String;
														
 
															   var
														
 
															-    i : SizeInt;
														
 
															+    i, slen,
														
 
															+    destindex : SizeInt;
														
 
															+    len       : longint;
														
 
															+    uch       : UCS4Char;
														
 
															   begin
														
 
															-    setlength(result,length(s)+1);
														
 
															-    for i:=1 to length(s) do
														
 
															-      result[i-1]:=UCS4Char(s[i]);
														
 
															-    result[length(s)]:=UCS4Char(0);
														
 
															+    slen:=length(s);
														
 
															+    setlength(result,slen+1);
														
 
															+    i:=1;
														
 
															+    destindex:=0;
														
 
															+    while (i<=slen) do
														
 
															+      begin
														
 
															+        result[destindex]:=utf16toutf32(s,i,len);
														
 
															+        inc(destindex);
														
 
															+        inc(i,len);
														
 
															+      end;
														
 
															+    result[destindex]:=UCS4Char(0);
														
 
															+    { destindex <= slen }
														
 
															+    setlength(result,destindex);
														
 
															   end;
														
 
															+{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
														
 
															+procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
														
 
															+var
														
 
															+  p : PWideChar;
														
 
															+begin
														
 
															+  { if nc > $ffff, we need two places }
														
 
															+  if (index+ord(nc > $ffff)>length(s)) then
														
 
															+    if (length(s) < 10*256) then
														
 
															+      setlength(s,length(s)+10)
														
 
															+    else
														
 
															+      setlength(s,length(s)+length(s) shr 8);
														
 
															+  { we know that s is unique -> avoid uniquestring calls}
														
 
															+  p:=@s[index];
														
 
															+  if (nc<$ffff) then
														
 
															+    begin
														
 
															+      p^:=widechar(nc);
														
 
															+      inc(index);
														
 
															+    end
														
 
															+  else if (nc<=$10ffff) then
														
 
															+    begin
														
 
															+      p^:=widechar((nc - $10000) shr 10 + $d800);
														
 
															+      (p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
														
 
															+      inc(index,2);
														
 
															+    end
														
 
															+  else
														
 
															+    { invalid code point }
														
 
															+    begin
														
 
															+      p^:='?';
														
 
															+      inc(index);
														
 
															+    end;
														
 
															+end;
														
 
															+
														
 
															+
														
 
															 function UCS4StringToWideString(const s : UCS4String) : WideString;
														
 
															   var
														
 
															-    i : SizeInt;
														
 
															+    i, slen   : SizeInt;
														
 
															+    nc        : wint_t;
														
 
															+    resindex  : SizeInt;
														
 
															+    len       : longint;
														
 
															+    valid     : boolean;
														
 
															   begin
														
 
															-    setlength(result,length(s)-1);
														
 
															-    for i:=1 to length(s)-1 do
														
 
															-      result[i]:=WideChar(s[i-1]);
														
 
															+    SetLength(result,length(s));
														
 
															+    resindex:=1;
														
 
															+    for i:=0 to high(s) do
														
 
															+      ConcatUTF32ToWideStr(s[i],result,resindex);
														
 
															+    { adjust result length (may be too big due to growing }
														
 
															+    { for surrogate pairs)                                }
														
 
															+    setlength(result,resindex-1);
														
 
															   end;
														
--- a/tests/test/twide5.pp
+++ b/tests/test/twide5.pp
@@ -0,0 +1,44 @@
 
															+{$codepage utf-8}
														
 
															+
														
 
															+var
														
 
															+  ws: widestring;
														
 
															+  us: UCS4String;
														
 
															+begin
														
 
															+// the compiler does not yet support characters which require
														
 
															+// a surrogate pair in utf-16
														
 
															+//  ws:='éłŁćçŹ你';
														
 
															+//  so write the last character directly using a utf-16 surrogate pair
														
 
															+  ws:='éłŁćçŹ'#$d87e#$dc04;
														
 
															+
														
 
															+  if (length(ws)<>8) or
														
 
															+     (ws[1]<>'é') or
														
 
															+     (ws[2]<>'ł') or
														
 
															+     (ws[3]<>'Ł') or
														
 
															+     (ws[4]<>'ć') or
														
 
															+     (ws[5]<>'ç') or
														
 
															+     (ws[6]<>'Ź') or
														
 
															+     (ws[7]<>#$d87e) or
														
 
															+     (ws[8]<>#$dc04) then
														
 
															+    halt(1);
														
 
															+  us:=WideStringToUCS4String(ws);
														
 
															+  if (length(us)<>7) or
														
 
															+     (us[0]<>UCS4Char(widechar('é'))) or
														
 
															+     (us[1]<>UCS4Char(widechar('ł'))) or
														
 
															+     (us[2]<>UCS4Char(widechar('Ł'))) or
														
 
															+     (us[3]<>UCS4Char(widechar('ć'))) or
														
 
															+     (us[4]<>UCS4Char(widechar('ç'))) or
														
 
															+     (us[5]<>UCS4Char(widechar('Ź'))) or
														
 
															+     (us[6]<>UCS4Char($2F804)) then
														
 
															+    halt(2);
														
 
															+  ws:=UCS4StringToWideString(us);
														
 
															+  if (length(ws)<>8) or
														
 
															+     (ws[1]<>'é') or
														
 
															+     (ws[2]<>'ł') or
														
 
															+     (ws[3]<>'Ł') or
														
 
															+     (ws[4]<>'ć') or
														
 
															+     (ws[5]<>'ç') or
														
 
															+     (ws[6]<>'Ź') or
														
 
															+     (ws[7]<>#$d87e) or
														
 
															+     (ws[8]<>#$dc04) then
														
 
															+    halt(3);
														
 
															+end.