Просмотр исходного кода

* implement support for 4 Byte UTF-8 codepoints that result in a surrogate pair for UTF-16

git-svn-id: trunk@36116 -
svenbarth 8 лет назад
Родитель
Сommit
c552b2957a
3 измененных файлов с 53 добавлено и 2 удалено
  1. 1 0
      .gitattributes
  2. 29 2
      compiler/scanner.pas
  3. 23 0
      tests/test/tcpstr28.pp

+ 1 - 0
.gitattributes

@@ -12291,6 +12291,7 @@ tests/test/tcpstr26b.pp svneol=native#text/plain
 tests/test/tcpstr26c.pp svneol=native#text/plain
 tests/test/tcpstr26d.pp svneol=native#text/plain
 tests/test/tcpstr27.pp svneol=native#text/plain
+tests/test/tcpstr28.pp svneol=native#text/pascal
 tests/test/tcpstr2a.pp svneol=native#text/plain
 tests/test/tcpstr3.pp svneol=native#text/plain
 tests/test/tcpstr4.pp svneol=native#text/plain

+ 29 - 2
compiler/scanner.pas

@@ -4586,6 +4586,7 @@ type
     procedure tscannerfile.readtoken(allowrecordtoken:boolean);
       var
         code    : integer;
+        d : cardinal;
         len,
         low,high,mid : longint;
         w : word;
@@ -5160,9 +5161,35 @@ type
                                    iswidestring:=true;
                                    len:=0;
                                  end;
-                               { four or more chars aren't handled }
+                               { four chars }
                                if (ord(c) and $f0)=$f0 then
-                                 message(scan_e_utf8_bigger_than_65535)
+                                 begin
+                                   { this always represents a surrogate pair, so
+                                     read as 32-bit value and then split into
+                                     the corresponding pair of two wchars }
+                                   d:=ord(c) and $f;
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   if d<$10000 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=d-$10000;
+                                   { high surrogate }
+                                   w:=$d800+(d shr 10);
+                                   concatwidestringchar(patternw,w);
+                                   { low surrogate }
+                                   w:=$dc00+(d and $3ff);
+                                   concatwidestringchar(patternw,w);
+                                 end
                                { three chars }
                                else if (ord(c) and $e0)=$e0 then
                                  begin

+ 23 - 0
tests/test/tcpstr28.pp

@@ -0,0 +1,23 @@
+program tcpstr28;
+
+{$codepage utf8}
+
+const
+  Test = '𝄞𤽜';
+  UTF8Test = UTF8String(Test);
+  UTF16Test = UnicodeString(Test);
+
+var
+  utf8str: UTF8String = Test;
+  utf16str: UnicodeString = Test;
+
+begin
+  if Length(UTF8Test) <> 8 then
+    Halt(1);
+  if Length(utf8str) <> 8 then
+    Halt(1);
+  if Length(UTF16Test) <> 4 then
+    Halt(1);
+  if Length(utf16str) <> 4 then
+    Halt(1);
+end.