Browse Source

Make Utf8CodepointLen adhere to the Unicode standard somewhat more and know all of the Unicode 15.0 combining characters.

Rika Ichinose 2 years ago
parent
commit
b38d13577f
2 changed files with 267 additions and 124 deletions
  1. 216 121
      rtl/inc/generic.inc
  2. 51 3
      tests/test/tutf8cpl.pp

+ 216 - 121
rtl/inc/generic.inc

@@ -1064,141 +1064,236 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 {$endif not cpujvm}
 {$endif not cpujvm}
 
 
 
 
+{ Combining codepoints are those belonging to one of the three "Mark" general categories.
+  UnicodeData.txt column 3 has M* for them: Mn, Mc, Me.
+
+  Using the table below, codepoint %...XXXXXXXX_YYYY_ZZZZZ can be classified as combining or not with a 3-level lookup:
+
+  if %...XXXXXXXX <= High(IsCombinings.L2) then
+  begin
+    index := IsCombinings.L2[%XXXXXXXX];
+    index := IsCombinings.L1[index][%YYYY];
+    IsCombining := boolean(IsCombinings.L0[index] shr %ZZZZZ and 1);
+  end else
+    IsCombining := false;
+
+  Equivalent one-liner:
+
+  IsCombining := (%...XXXXXXXX <= High(IsCombinings.L2)) and (IsCombinings.L0[IsCombinings.L1[IsCombinings.L2[%XXXXXXXX]][%YYYY]] shr %ZZZZZ and 1 <> 0);
+
+  Additionally, there is a combining range U+E0100..U+E01EF far beyond the end of the table; it is left out to save one lookup level and must be checked separately.
+
+  Table built from UnicodeData.txt 15.0.0 (September 2022). }
+
+const
+  IsCombinings: record
+    L2: array[0 .. 244] of uint8;
+    L1: array[0 .. 46, 0 .. 15] of uint8;
+    L0: array[0 .. 161] of uint32;
+  end =
+  (
+    L2: (
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, {10} 0, 10, 11, 12, 13, 0, 14, 0, 0, 0, {20} 0, 0, 15, 0, 16, 0, 0, 0, 0, 0, {30} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      {40} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {50} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {60} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {70} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {80} 0,
+      0, 0, 17, 18, 19, 0, 0, 0, 0, {90} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {100} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {110} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {120} 0, 0,
+      0, 0, 0, 20, 0, 21, 22, 23, {130} 0, 0, 0, 24, 25, 26, 27, 28, 29, 30, {140} 31, 32, 33, 34, 0, 0, 0, 0, 0, 0, {150} 0, 0, 0, 0, 35, 0, 0, 0, 0, 0,
+      {160} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {170} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {180} 0, 36, 0, 37, 0, 0, 0, 0, 0, 0, {190} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      {200} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {210} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {220} 0, 0, 38, 0, 0, 0, 0, 0, 0, 0, {230} 0, 39, 40, 41, 0, 0, 0, 42, 0, 0,
+      {240} 43, 44, 45, 0, 46
+    );
+    L1: (
+      {0} (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),               (0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0),
+      {2} (0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0),               (7, 0, 8, 9, 0, 0, 10, 11, 12, 13, 14, 0, 0, 15, 0, 16),
+      {4} (17, 18, 19, 0, 20, 0, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), (27, 28, 31, 32, 27, 28, 33, 34, 27, 28, 35, 26, 36, 37, 38, 0),
+      {6} (39, 28, 40, 26, 27, 28, 40, 41, 23, 42, 43, 26, 27, 0, 44, 45), (0, 46, 47, 0, 0, 48, 49, 0, 50, 51, 0, 4, 52, 53, 54, 0),
+      {8} (0, 55, 56, 57, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),           (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 0),
+      {10} (0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 45, 45, 0, 62, 63, 0),        (64, 0, 0, 0, 65, 66, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0),
+      {12} (68, 0, 69, 70, 0, 13, 1, 1, 39, 62, 39, 71, 72, 73, 0, 74),   (0, 75, 0, 0, 0, 0, 76, 77, 0, 0, 0, 0, 0, 0, 1, 1),
+      {14} (0, 0, 0, 0, 0, 0, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0),             (0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 79, 0, 0, 0, 1),
+      {16} (0, 80, 0, 0, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),            (0, 0, 0, 82, 37, 0, 0, 83, 0, 0, 0, 0, 0, 0, 0, 0),
+      {18} (84, 85, 0, 0, 86, 62, 87, 88, 0, 89, 90, 0, 23, 91, 92, 93),  (0, 94, 95, 96, 0, 97, 98, 99, 0, 0, 0, 0, 0, 0, 0, 100),
+      {20} (0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0, 0),            (2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      {22} (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102),            (0, 0, 0, 0, 0, 0, 0, 92, 0, 0, 0, 103, 0, 0, 0, 0),
+      {24} (104, 105, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0),         (0, 0, 0, 0, 0, 0, 0, 0, 0, 106, 0, 0, 0, 0, 0, 0),
+      {26} (0, 0, 0, 0, 0, 107, 0, 59, 0, 0, 15, 0, 108, 0, 0, 0),        (72, 20, 109, 110, 72, 7, 36, 0, 72, 111, 65, 112, 72, 91, 113, 0),
+      {28} (0, 114, 98, 0, 0, 0, 79, 14, 23, 42, 29, 115, 0, 0, 0, 0),    (0, 116, 117, 0, 0, 13, 23, 0, 0, 0, 0, 0, 0, 118, 119, 0),
+      {30} (0, 13, 92, 0, 0, 120, 0, 0, 59, 121, 0, 0, 0, 0, 0, 0),       (0, 122, 0, 0, 0, 0, 0, 0, 0, 123, 124, 0, 0, 0, 125, 126),
+      {32} (127, 128, 129, 0, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),      (0, 131, 0, 0, 132, 133, 0, 0, 0, 134, 135, 0, 136, 0, 0, 0),
+      {34} (0, 0, 0, 0, 0, 0, 0, 137, 138, 139, 72, 0, 0, 0, 0, 0),       (0, 0, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      {36} (0, 0, 0, 0, 0, 0, 0, 141, 0, 142, 0, 0, 0, 0, 0, 0),          (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 143, 1, 144, 0, 0, 145),
+      {38} (0, 0, 0, 0, 146, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),            (0, 0, 0, 0, 0, 0, 0, 0, 1, 147, 109, 0, 0, 0, 0, 0),
+      {40} (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 148, 149, 150, 0, 0),        (0, 0, 151, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      {42} (1, 152, 1, 153, 154, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),      (156, 157, 0, 0, 158, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0),
+      {44} (0, 0, 0, 0, 0, 159, 0, 160, 0, 0, 0, 0, 0, 0, 0, 0),          (0, 0, 0, 0, 0, 0, 0, 160, 0, 0, 0, 0, 0, 0, 0, 0),
+      {46} (0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 161, 0, 0, 0, 0, 0)
+    );
+    L0: (
+      {0} $00000000,   $FFFFFFFF, $0000FFFF, $000003F8, $FFFE0000, $BFFFFFFF, $000000B6, $07FF0000, $FFFFF800, $00010000, $9FC00000, $00003D9F, $00020000,
+      {13} $FFFF0000,  $000007FF, $0001FFC0, $200FF800, $FBC00000, $00003EEF, $0E000000, $FF000000, $FFFFFC00, $FFFFFFFB, $0000000F, $DC000000, $00FEFFFF,
+      {26} $0000000C,  $0000000E, $D0000000, $0080399F, $4000000C, $00023987, $00230000, $00003BBF, $FC00000C, $00E0399F, $00000004, $C0000000, $00803DC7,
+      {39} $0000001F,  $00603DDF, $0008000C, $D8000000, $00803DDF, $FF5F8400, $000C0000, $07F20000, $00007F80, $1FF20000, $00007F00, $03000000, $C2A00000,
+      {52} $FEFFE0DF,  $1FFFFFFF, $00000040, $7FFFF800, $C3C00000, $001E3F9D, $3C00BFFC, $E0000000, $003C0000, $001C0000, $FFF00000, $200FFFFF, $0000B800,
+      {65} $00000060,  $00000200, $0FFF0FFF, $0F800000, $7FE00000, $9FFFFFFF, $000FF800, $00000007, $00003FFE, $000FFFC0, $00FFFFF0, $FFF70000, $039021FF,
+      {78} $00038000,  $80000000, $0000FC00, $06000000, $3FF78000, $00030000, $00000844, $000010F8, $00000003, $0000003F, $8003FFFF, $00003FC0, $000FFF80,
+      {91} $FFF80000,  $00000001, $00000020, $007FFE00, $00003008, $38000000, $C19D0000, $00000002, $0060F800, $000037F8, $40000000, $20000000, $07C00000,
+      {104} $0000F06E, $87000000, $000000F0, $00001800, $0000003C, $0000007F, $80190000, $001FFF80, $00080000, $0000DE01, $40FFF000, $001F1FCC, $FFE00000,
+      {117} $4000007F, $FF3F8000, $30000001, $00FFF800, $00000FFF, $07FFF000, $79BF0000, $0000000D, $FCFE0000, $00000011, $000007FE, $7BF80000, $0FFE0080,
+      {130} $03FFFC00, $FF7F8000, $FFFC0000, $007FFEFF, $B47E0000, $000000BF, $00FB7C00, $00780000, $0000000B, $C7F00000, $003FFF81, $001F0000, $007F0000,
+      {143} $FFFE8000, $000780FF, $00030010, $60000000, $FFFF3FFF, $F807E3E0, $00000FE7, $00003C00, $0000001C, $F87FFFFF, $00201FFF, $F8000010, $0000FFFE,
+      {156} $F9FFFF7F, $000007DB, $00008000, $00004000, $0000F000, $000007F0
+    );
+  );
+
 function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
 function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
   var
   var
-    bytes: sizeint;
-    firstzerobit: byte;
+    cp: uint32;
+    iByte,cpLen: SizeInt;
   begin
   begin
     { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
     { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
+    result:=0;
 
 
-    if maxlookahead<=0 then
-      begin
-        { incomplete }
-        result:=0;
+    { result = 0 when scanning first character, result > 0 when scanning potential diacritical marks following it.
+
+      Common case is correct UTF-8.
+
+      Setting cpLen and breaking from the loop (instead of exiting) will handle invalid/incomplete cases
+      when cpLen bytes were expected, but not all are present/valid.
+      This keeps the code more compact, both source and binary. }
+
+    repeat
+      if MaxLookAhead<1 then
         exit;
         exit;
-      end;
-    { include the first byte }
-    result:=1;
-    { multiple byte utf-8 code point? }
-    if p[0]>#127 then
-      begin
-        { bsr searches for the leftmost 1 bit. We are interested in the
-          leftmost 0 bit, so first invert the value
-        }
-        firstzerobit:=bsrbyte(not(byte(p[0])));
-        { if there is no zero bit or the first zero bit is the rightmost bit
-          (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
-          UTF-8-encoded string, and in the worst case bit 1 has to be zero)
-          Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
-          cannot be the first zero-bit either. And bits 6 and 7 can't be 0
-          either in the first byte.
-        }
-        if (firstzerobit<=1) or (firstzerobit>=6)  then
-          begin
-            result:=-result;
-            exit;
-          end;
-        { the number of bytes belonging to this code point is
-          7-(pos first 0-bit). Subtract 1 since we're already at the first
-          byte. All subsequent bytes of the same sequence must have their
-          highest bit set and the next one unset. We stop when we detect an
-          invalid sequence.
-        }
-        bytes:=6-firstzerobit;
-        while (result<maxlookahead) and
-              (bytes>0) and
-              ((ord(p[result]) and %11000000)=%10000000) do
-          begin
-            inc(result);
-            dec(bytes);
-          end;
-        { stopped because of invalid/incomplete sequence -> exit }
-        if bytes<>0 then
-          begin
-            if result>=maxlookahead then
-              result:=0
-            else
-              result:=-result;
+
+      case ord(P[0]) of
+        { One-byte codepoints have the form
+          %(0)xxxxxxx. }
+
+        0..$7F {%01111111}:
+          { There are no diacritics among them. }
+          if not IncludeCombiningDiacriticalMarks then
+            exit(1)
+          else if result=0 then
+            begin
+              result:=1;
+              Inc(P);
+              Dec(MaxLookAhead);
+            end
+          else
             exit;
             exit;
-          end;
-      end;
-    if includecombiningdiacriticalmarks then
-      begin
-        { combining diacritical marks?
-            1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
-            2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
-            3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
-            4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
-            5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
-        }
-        repeat
-          bytes:=result;
-          if result+1<maxlookahead then
+
+        { Two-byte codepoints have the form
+          %(110)xxxxx (10)xxxxxx.
+
+          but must also encode at least $80 = %10000000 =
+          %(110)00010 (10)000000. }
+
+        $C2 {%11000010}..$DF {%11011111}:
+          if (MaxLookAhead>=2) and (ord(P[1]) and $C0=$80) then
+            begin
+              if not IncludeCombiningDiacriticalMarks then
+                exit(2);
+              if result>0 then
+                begin
+                  cp:=ord(P[0]) and $1F {%11111} shl 6 or ord(P[1]) and $3F {%111111};
+                  { Max possible cp value, $7FF, won't overflow L2. }
+                  if IsCombinings.L0[IsCombinings.L1[IsCombinings.L2[cp shr (5+4)]][cp shr 5 and (1 shl 4-1)]] shr (cp and (1 shl 5-1)) and 1=0 then
+                    exit;
+                end;
+              Inc(result,2);
+              Inc(P,2);
+              Dec(MaxLookAhead,2);
+            end
+          else
+            begin
+              cpLen:=2;
+              break;
+            end;
+
+        { Three-byte codepoints have the form
+          %(1110)xxxx (10)xxxxxx (10)xxxxxx
+
+          but must also encode at least $800 = %1000 00000000 =
+          %(1110)0000 (10)100000 (10)000000. }
+
+        $E0 {%11100000}..$EF {%11101111}:
+          if (MaxLookAhead>=3) and (ord(P[1]) and $C0=$80) and (ord(P[2]) and $C0=$80) and ((ord(P[0])>$E0 {%11100000}) or (ord(P[1])>=$A0 {%10100000})) then
+            begin
+              if not IncludeCombiningDiacriticalMarks then
+                exit(3);
+              if result>0 then
+                begin
+                  cp:=ord(P[0]) and $F {%1111} shl 12 or ord(P[1]) and $3F {%111111} shl 6 or ord(P[2]) and $3F {%111111};
+                  { Max possible cp value, $FFFF, won't overflow L2. }
+                  if IsCombinings.L0[IsCombinings.L1[IsCombinings.L2[cp shr (5+4)]][cp shr 5 and (1 shl 4-1)]] shr (cp and (1 shl 5-1)) and 1=0 then
+                    exit;
+                end;
+              Inc(result,3);
+              Inc(P,3);
+              Dec(MaxLookAhead,3);
+            end
+          else
+            begin
+              cpLen:=3;
+              break;
+            end;
+
+        { Four-byte codepoints have the form
+          %(11110)xxx (10)xxxxxx (10)xxxxxx (10)xxxxxx
+
+          but must also encode at least $10000 = %1 00000000 00000000 =
+          %(11110)000 (10)010000 (10)000000 (10)000000
+
+          and at most $10FFFF = %10000 11111111 11111111 =
+          %(11110)100 (10)001111 (10)111111 (10)111111. }
+
+        $F0 {%11110000}..$F4 {%11110100}:
+          if (MaxLookAhead>=4) and (ord(P[1]) and $C0=$80) and (ord(P[2]) and $C0=$80) and (ord(P[3]) and $C0=$80) and
+             (uint16(P[0]) shl 8 or ord(P[1])>=$F090 {%11110000 10010000}) and
+             (uint16(P[0]) shl 8 or ord(P[1])<=$F48F {%11110100 10001111}) then
             begin
             begin
-              { case 1) }
-              if ((ord(p[result]) and %11001100=%11001100)) and
-                  (ord(p[result+1])>=%10000000) and
-                  (ord(p[result+1])<=%10101111) then
-                inc(result,2)
-                  { case 2), 3), 4), 5) }
-              else if (result+2<maxlookahead) and
-                 (ord(p[result])>=%11100001) then
+              if not IncludeCombiningDiacriticalMarks then
+                exit(4);
+              if result>0 then
                 begin
                 begin
-                     { case 2) }
-                  if ((ord(p[result])=%11100001) and
-                      (ord(p[result+1])=%10101010) and
-                      (ord(p[result+2])>=%10110000) and
-                      (ord(p[result+2])<=%10111111)) or
-                     { case 3) }
-                     ((ord(p[result])=%11100001) and
-                      (ord(p[result+1])=%10110111) and
-                      (ord(p[result+2])>=%10000000) and
-                      (ord(p[result+2])<=%10111111)) or
-                     { case 4) }
-                     ((ord(p[result])=%11100010) and
-                      (ord(p[result+1])=%10000011) and
-                      (ord(p[result+2])>=%10010000) and
-                      (ord(p[result+2])<=%10111111)) or
-                     { case 5) }
-                     ((ord(p[result])=%11101111) and
-                      (ord(p[result+1])=%10111000) and
-                      (ord(p[result+2])>=%10100000) and
-                      (ord(p[result+2])<=%10101111)) then
-                    inc(result,3);
+                  cp:=ord(P[0]) and $7 {%111} shl 18 or ord(P[1]) and $3F {%111111} shl 12 or ord(P[2]) and $3F {%111111} shl 6 or ord(P[3]) and $3F {%111111};
+                  { This time, cp can overflow L2, and can have special-cased values U+E0100..U+E01EF. }
+                  if cp<length(IsCombinings.L2) shl (5+4) then
+                    begin
+                      if IsCombinings.L0[IsCombinings.L1[IsCombinings.L2[cp shr (5+4)]][cp shr 5 and (1 shl 4-1)]] shr (cp and (1 shl 5-1)) and 1=0 then
+                        exit;
+                    end
+                  else if not ((cp>=$E0100) and (cp<=$E01EF)) then
+                    exit;
                 end;
                 end;
+              Inc(result,4);
+              Inc(P,4);
+              Dec(MaxLookAhead,4);
+            end
+          else
+            begin
+              cpLen:=4;
+              break;
             end;
             end;
-        until bytes=result;
-        { is there an incomplete diacritical mark? (invalid makes little sense:
-          either a sequence is a combining diacritical mark, or it's not ; if
-          it's invalid, it may also not have been a combining diacritical mark)
-        }
-        if result<maxlookahead then
+        else
           begin
           begin
-               { case 1) }
-            if (((ord(p[result]) and %11001100=%11001100)) and
-                (result+1>=maxlookahead)) or
-               { case 2) and 3)}
-               ((ord(p[result])=%11100001) and
-                ((result+1>=maxlookahead) or
-                 (((ord(p[result+1])=%10101010) or
-                   (ord(p[result+1])=%10110111)) and
-                  (result+2>=maxlookahead)))) or
-               { case 4 }
-               ((ord(p[result])=%11100010) and
-                ((result+1>=maxlookahead) or
-                 ((ord(p[result+1])=%10000011) and
-                  (result+2>=maxlookahead)))) or
-               { case 5 }
-               ((ord(p[result])=%11101111) and
-                ((result+1>=maxlookahead) or
-                 ((ord(p[result+1])=%10111000) and
-                  (result+2>=maxlookahead)))) then
-              begin
-                result:=0;
-                exit;
-              end;
+            cpLen:=1;
+            break;
           end;
           end;
       end;
       end;
+    until false;
+
+    { Handle invalid or incomplete cases, when expected codepoint length is cpLen. }
+    for iByte:=1 to cpLen-1 do
+      if (iByte<MaxLookAhead) and (ord(P[iByte]) and $C0 {%11000000}<>$80 {%10000000}) then
+        begin
+          if result=0 then result:=-1-iByte;
+          exit;
+        end;
+
+    if cpLen>MaxLookAhead then
+      result:=0 { Signal an incomplete codepoint, even if there were complete codepoints before. }
+    else if result=0 then
+       result:=-cpLen;
   end;
   end;
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
 {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}

+ 51 - 3
tests/test/tutf8cpl.pp

@@ -8,7 +8,7 @@ procedure check(index, lookahead: longint; combiningdiacritics: boolean; expecte
 begin
 begin
   if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
   if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
     begin
     begin
-      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
+      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics),' <> expected ',expectedresult);
       halt(1);
       halt(1);
     end;
     end;
 end;
 end;
@@ -143,6 +143,54 @@ begin
   check(5,2,false,0,92);
   check(5,2,false,0,92);
   check(5,2,true,0,93);
   check(5,2,true,0,93);
   { 2) invalid }
   { 2) invalid }
-  check(5,3,false,-2,94);
-  check(5,3,true,-2,95);
+  check(5,3,false,-3,94);
+  check(5,3,true,-3,95);
+
+  { Last allowed 4-byte codepoint, U+10FFFF. }
+  name:=#$f4#$8f#$bf#$bf;
+  check(1,4,false,4,96);
+
+  { Last allowed 4-byte codepoint + 1, U+110000. }
+  name:=#$f4#$90#$80#$80;
+  check(1,4,false,-4,97);
+
+  { First 5-byte codepoint, U+200000. }
+  name:=#$f8#$88#$80#$80#$80;
+  check(1,5,false,-1,98);
+
+  { Overlong 2-byte U+7F. }
+  name:=#$c1#$bf;
+  check(1,2,false,-1,99);
+
+  { Overlong 3-byte NULL. }
+  name:=#$e0#$80#$80;
+  check(1,3,false,-3,100);
+
+  { Overlong 4-byte U+FFFF. }
+  name:=#$f0#$8f#$bf#$bf;
+  check(1,4,false,-4,101);
+
+  { Cyrillic A + U+1AFF (last in the combining range 1AB0..1AFF). }
+  name:='А᫿';
+  check(1,5,true,5,102);
+
+  { Cyrillic A + U+1B00 (character just to the right of the combining range 1AB0..1AFF that happens to be combining anyway :D) }
+  name:='Аᬀ';
+  check(1,5,true,5,103);
+
+  { Cyrillic A + U+33F COMBINING DOUBLE OVERLINE. }
+  name:='А̿';
+  check(1,4,true,4,104);
+
+  { Cyrillic A + U+3099 (kana voice mark, 3-byte combining character outside the five ranges hard-coded by the old implementation). }
+  name:='А゙';
+  check(1,5,true,5,105);
+
+  { Cyrillic A + U+1D167 (tremolo, 4-byte combining character outside the five ranges hard-coded by the old implementation). }
+  name:='А𝅧';
+  check(1,6,true,6,106);
+
+  { Cyrillic A + U+E0100 (variation selector 17, 4-byte combining character outside the five ranges hard-coded by the old implementation, special-cased in Utf8CodepointLen). }
+  name:='А󠄀';
+  check(1,6,true,6,107);
 end.
 end.