10 år sedan · ff020a3be4
--- a/.gitattributes
+++ b/.gitattributes
@@ -12392,6 +12392,7 @@ tests/test/tunit3.pp svneol=native#text/plain
 
				 tests/test/tunroll1.pp svneol=native#text/plain
			
 
				 tests/test/tutf81.pp svneol=native#text/plain
			
 
				 tests/test/tutf82.pp svneol=native#text/plain
			
 
				+tests/test/tutf8cpl.pp svneol=native#text/plain
			
 
				 tests/test/tvarpropsetter1.pp svneol=native#text/plain
			
 
				 tests/test/tvarpropsetter2.pp svneol=native#text/plain
			
 
				 tests/test/tvarset1.pp svneol=native#text/plain
			
--- a/rtl/inc/generic.inc
+++ b/rtl/inc/generic.inc
@@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 
				 
			
 
				 {$endif not cpujvm}
			
 
				 
			
 
				+
			
 
				+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
			
 
				+  var
			
 
				+    bytes: sizeint;
			
 
				+    firstzerobit: byte;
			
 
				+  begin
			
 
				+    { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
			
 
				+
			
 
				+    if maxlookahead<=0 then
			
 
				+      begin
			
 
				+        { incomplete }
			
 
				+        result:=0;
			
 
				+        exit;
			
 
				+      end;
			
 
				+    { inlcude the first byte }
			
 
				+    result:=1;
			
 
				+    { multiple byte utf-8 code point? }
			
 
				+    if p[0]>#127 then
			
 
				+      begin
			
 
				+        { bsr searches for the leftmost 1 bit. We are interested in the
			
 
				+          leftmost 0 bit, so first invert the value
			
 
				+        }
			
 
				+        firstzerobit:=bsrbyte(not(byte(p[0])));
			
 
				+        { if there is no zero bit or the first zero bit is the rightmost bit
			
 
				+          (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
			
 
				+          UTF-8-encoded string, and in the worst case bit 1 has to be zero)
			
 
				+          Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
			
 
				+          cannot be the first zero-bit either. And bits 6 and 7 can't be 0
			
 
				+          either in the first byte.
			
 
				+        }
			
 
				+        if (firstzerobit<=1) or (firstzerobit>=6)  then
			
 
				+          begin
			
 
				+            result:=-result;
			
 
				+            exit;
			
 
				+          end;
			
 
				+        { the number of bytes belonging to this code point is
			
 
				+          7-(pos first 0-bit). Subtract 1 since we're already at the first
			
 
				+          byte. All subsequent bytes of the same sequence must have their
			
 
				+          highest bit set and the next one unset. We stop when we detect an
			
 
				+          invalid sequence.
			
 
				+        }
			
 
				+        bytes:=6-firstzerobit;
			
 
				+        while (result<maxlookahead) and
			
 
				+              (bytes>0) and
			
 
				+              ((ord(p[result]) and %11000000)=%10000000) do
			
 
				+          begin
			
 
				+            inc(result);
			
 
				+            dec(bytes);
			
 
				+          end;
			
 
				+        { stopped because of invalid/incomplete sequence -> exit }
			
 
				+        if bytes<>0 then
			
 
				+          begin
			
 
				+            if result>=maxlookahead then
			
 
				+              result:=0
			
 
				+            else
			
 
				+              result:=-result;
			
 
				+            exit;
			
 
				+          end;
			
 
				+      end;
			
 
				+    if includecombiningdiacriticalmarks then
			
 
				+      begin
			
 
				+        { combining diacritical marks?
			
 
				+            1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
			
 
				+            2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
			
 
				+            3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
			
 
				+            4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
			
 
				+            5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
			
 
				+        }
			
 
				+        repeat
			
 
				+          bytes:=result;
			
 
				+          if result+1<maxlookahead then
			
 
				+            begin
			
 
				+              { case 1) }
			
 
				+              if ((ord(p[result]) and %11001100=%11001100)) and
			
 
				+                  (ord(p[result+1])>=%10000000) and
			
 
				+                  (ord(p[result+1])<=%10101111) then
			
 
				+                inc(result,2)
			
 
				+                  { case 2), 3), 4), 5) }
			
 
				+              else if (result+2<maxlookahead) and
			
 
				+                 (ord(p[result])>=%11100001) then
			
 
				+                begin
			
 
				+                     { case 2) }
			
 
				+                  if ((ord(p[result])=%11100001) and
			
 
				+                      (ord(p[result+1])=%10101010) and
			
 
				+                      (ord(p[result+2])>=%10110000) and
			
 
				+                      (ord(p[result+2])<=%10111111)) or
			
 
				+                     { case 3) }
			
 
				+                     ((ord(p[result])=%11100001) and
			
 
				+                      (ord(p[result+1])=%10110111) and
			
 
				+                      (ord(p[result+2])>=%10000000) and
			
 
				+                      (ord(p[result+2])<=%10111111)) or
			
 
				+                     { case 4) }
			
 
				+                     ((ord(p[result])=%11100010) and
			
 
				+                      (ord(p[result+1])=%10000011) and
			
 
				+                      (ord(p[result+2])>=%10010000) and
			
 
				+                      (ord(p[result+2])<=%10111111)) or
			
 
				+                     { case 5) }
			
 
				+                     ((ord(p[result])=%11101111) and
			
 
				+                      (ord(p[result+1])=%10111000) and
			
 
				+                      (ord(p[result+2])>=%10100000) and
			
 
				+                      (ord(p[result+2])<=%10101111)) then
			
 
				+                    inc(result,3);
			
 
				+                end;
			
 
				+            end;
			
 
				+        until bytes=result;
			
 
				+        { is there an incomplete diacritical mark? (invalid makes little sense:
			
 
				+          either a sequence is a combining diacritical mark, or it's not ; if
			
 
				+          it's invalid, it may also not have been a combining diacritical mark)
			
 
				+        }
			
 
				+        if result<maxlookahead then
			
 
				+          begin
			
 
				+               { case 1) }
			
 
				+            if (((ord(p[result]) and %11001100=%11001100)) and
			
 
				+                (result+1>=maxlookahead)) or
			
 
				+               { case 2) and 3)}
			
 
				+               ((ord(p[result])=%11100001) and
			
 
				+                ((result+1>=maxlookahead) or
			
 
				+                 (((ord(p[result+1])=%10101010) or
			
 
				+                   (ord(p[result+1])=%10110111)) and
			
 
				+                  (result+2>=maxlookahead)))) or
			
 
				+               { case 4 }
			
 
				+               ((ord(p[result])=%11100010) and
			
 
				+                ((result+1>=maxlookahead) or
			
 
				+                 ((ord(p[result+1])=%10000011) and
			
 
				+                  (result+2>=maxlookahead)))) or
			
 
				+               { case 5 }
			
 
				+               ((ord(p[result])=%11101111) and
			
 
				+                ((result+1>=maxlookahead) or
			
 
				+                 ((ord(p[result+1])=%10111000) and
			
 
				+                  (result+2>=maxlookahead)))) then
			
 
				+              begin
			
 
				+                result:=0;
			
 
				+                exit;
			
 
				+              end;
			
 
				+          end;
			
 
				+      end;
			
 
				+  end;
			
 
				+
			
 
				 {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
			
 
				 
			
 
				 procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;
			
--- a/rtl/inc/systemh.inc
+++ b/rtl/inc/systemh.inc
@@ -1081,6 +1081,14 @@ Function  Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
 
				 function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
			
 
				 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
			
 
				 
			
 
				+{ result:
			
 
				+  <0: invalid sequence detected after processing "-result" bytes
			
 
				+  0: incomplete (may still be valid if MaxLookAhead is increased)
			
 
				+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
			
 
				+      parameter was true)
			
 
				+}
			
 
				+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
			
 
				+
			
 
				 { Shortstring functions }
			
 
				 Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
			
 
				 Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);
			
--- a/rtl/java/jsystemh.inc
+++ b/rtl/java/jsystemh.inc
@@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 
				 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
			
 
				 *)
			
 
				 
			
 
				+{ result:
			
 
				+  <0: invalid sequence detected after processing "-result" bytes
			
 
				+  0: incomplete (may still be valid if MaxLookAhead is increased)
			
 
				+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
			
 
				+      parameter was true)
			
 
				+}
			
 
				+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
			
 
				+
			
 
				+
			
 
				 var
			
 
				   { separated compared to generic version, for Java type safety }
			
 
				   FPC_EMPTYANSICHAR : array[0..0] of ansichar;
			
--- a/tests/test/tutf8cpl.pp
+++ b/tests/test/tutf8cpl.pp
@@ -0,0 +1,148 @@
 
				+{$mode objfpc}
			
 
				+{$codepage utf8}
			
 
				+
			
 
				+var
			
 
				+  name: utf8string;
			
 
				+  
			
 
				+procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
			
 
				+begin
			
 
				+  if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
			
 
				+    begin
			
 
				+      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
			
 
				+      halt(1);
			
 
				+    end;
			
 
				+end;
			
 
				+
			
 
				+begin
			
 
				+  name:='a';
			
 
				+  check(1,0,false,0,1);
			
 
				+  check(1,1,false,1,2);
			
 
				+  check(1,1,true,1,3);
			
 
				+  check(1,6,false,1,4);
			
 
				+  check(1,6,true,1,5);
			
 
				+  name:='ab';
			
 
				+  check(1,6,false,1,6);
			
 
				+  check(1,6,true,1,7);
			
 
				+  check(1,1,false,1,8);
			
 
				+  check(1,1,true,1,9);
			
 
				+  check(2,6,false,1,10);
			
 
				+  check(2,6,true,1,11);
			
 
				+  name:='é';
			
 
				+  check(1,1,false,0,12);
			
 
				+  check(1,1,true,0,13);
			
 
				+  check(1,2,false,2,14);
			
 
				+  check(1,2,true,2,15);
			
 
				+  check(2,1,false,-1,16);
			
 
				+  check(2,1,true,-1,17);
			
 
				+  check(2,3,false,-1,18);
			
 
				+  check(2,3,true,-1,19);
			
 
				+  name:='éa';
			
 
				+  check(1,1,false,0,20);
			
 
				+  check(1,1,true,0,21);
			
 
				+  check(1,2,false,2,22);
			
 
				+  check(1,2,true,2,23);
			
 
				+  check(2,1,false,-1,24);
			
 
				+  check(2,1,true,-1,25);
			
 
				+  check(2,3,false,-1,26);
			
 
				+  check(2,3,true,-1,27);
			
 
				+  check(3,1,false,1,28);
			
 
				+  check(3,1,true,1,29);
			
 
				+  check(3,4,false,1,30);
			
 
				+  check(3,4,true,1,31);
			
 
				+  name[3]:=name[2];
			
 
				+  check(1,1,false,0,32);
			
 
				+  check(1,1,true,0,33);
			
 
				+  check(1,2,false,2,34);
			
 
				+  check(1,2,true,2,35);
			
 
				+  check(2,1,false,-1,36);
			
 
				+  check(2,1,true,-1,37);
			
 
				+  check(2,3,false,-1,38);
			
 
				+  check(2,3,true,-1,39);
			
 
				+  check(3,1,false,-1,40);
			
 
				+  check(3,1,true,-1,41);
			
 
				+  check(3,4,false,-1,42);
			
 
				+  check(3,4,true,-1,43);
			
 
				+  { e + combining ` }
			
 
				+  name:='e'#$0300'b';
			
 
				+  { check just the e without accent }
			
 
				+  check(1,1,false,1,44);
			
 
				+  check(1,1,true,1,45);
			
 
				+  check(1,2,false,1,46);
			
 
				+  { partial diacritical mark }
			
 
				+  check(1,2,true,0,47);
			
 
				+  check(1,3,false,1,48);
			
 
				+  { complete diacritical mark }
			
 
				+  check(1,3,true,3,49);
			
 
				+  check(1,4,false,1,50);
			
 
				+  { complete diacritical mark (ignore extra character) }
			
 
				+  check(1,4,true,3,51);
			
 
				+  { start of combining diacritical mark -- treated as independent utf-8 codepoint }
			
 
				+  check(2,1,false,0,52);
			
 
				+  check(2,1,true,0,53);
			
 
				+  check(2,3,false,2,54);
			
 
				+  check(2,3,true,2,55);
			
 
				+  { middle of the combining diacritical mark }
			
 
				+  check(3,1,false,-1,56);
			
 
				+  check(3,1,true,-1,57);
			
 
				+  check(3,4,false,-1,58);
			
 
				+  check(3,4,true,-1,59);
			
 
				+  { corrupt diacritical mark = no diacritical mark }
			
 
				+  name[3]:=name[4];
			
 
				+  { partial diacritical mark (the corrupted byte is not included in the
			
 
				+    lookahead) }
			
 
				+  check(1,2,true,0,60);
			
 
				+  check(1,3,false,1,61);
			
 
				+  { ignore corrupt diacritical mark }
			
 
				+  check(1,3,true,1,62);
			
 
				+  check(1,4,false,1,63);
			
 
				+  check(1,4,true,1,64);
			
 
				+  { e + combining circle + combining superscript 'n' }
			
 
				+  name:='e'#$20DD#$1DE0'b';
			
 
				+  { partial diacritical mark }
			
 
				+  check(1,2,true,0,65);
			
 
				+  check(1,3,false,1,66);
			
 
				+  check(1,3,true,0,67);
			
 
				+  check(1,4,false,1,68);
			
 
				+  { complete diacritical mark }
			
 
				+  check(1,4,true,4,69);
			
 
				+  check(1,4,false,1,70);
			
 
				+  { partial second diacritical mark }
			
 
				+  check(1,5,true,0,71);
			
 
				+  check(1,5,false,1,72);
			
 
				+  check(1,6,true,0,73);
			
 
				+  check(1,6,false,1,74);
			
 
				+  { complete both diacritical marks }
			
 
				+  check(1,7,true,7,75);
			
 
				+  check(1,7,false,1,76);
			
 
				+  check(1,10,true,7,77);
			
 
				+  check(1,10,false,1,78);
			
 
				+  { complete both diacritical marks without first character }
			
 
				+  check(2,6,true,6,79);
			
 
				+  check(2,20,true,6,80);
			
 
				+  { only the first one, treated as independent codepoint }
			
 
				+  check(2,7,false,3,81);
			
 
				+  { corrupt second diacritical mark }
			
 
				+  name[7]:=name[8];
			
 
				+  { partial second diacritical mark }
			
 
				+  check(1,5,true,0,82);
			
 
				+  check(1,5,false,1,83);
			
 
				+  check(1,6,true,0,84);
			
 
				+  check(1,6,false,1,85);
			
 
				+  { including bad byte -> ignore second diacritical mark completely
			
 
				+    (can't know it's part of a diacritical mark or something else) }
			
 
				+  check(1,7,true,4,86);
			
 
				+  check(1,7,false,1,87);
			
 
				+  { complete both diacritical marks without first character,
			
 
				+    but with corrupted byte }
			
 
				+  check(2,7,true,3,88);
			
 
				+  check(2,20,true,3,89);
			
 
				+  { corrupted diacritical mark by itself }
			
 
				+  { 1) incomplete }
			
 
				+  check(5,1,false,0,90);
			
 
				+  check(5,1,true,0,91);
			
 
				+  check(5,2,false,0,92);
			
 
				+  check(5,2,true,0,93);
			
 
				+  { 2) invalid }
			
 
				+  check(5,3,false,-2,94);
			
 
				+  check(5,3,true,-2,95);
			
 
				+end.